From 118cde131bb7c3d38f9834c0bdd7ffbcff836785 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Wed, 4 Feb 2026 23:39:03 +0000 Subject: [PATCH 01/26] feat: fork callgrind as tracegrind (mechanical rename) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure copy of callgrind/ to tracegrind/ with symbol prefix rename CLG_ → TG_ (expanding to vgTracegrind_), header guards updated, public header renamed to tracegrind.h with TRACEGRIND_* macros. No behavioral changes — output is still identical to callgrind. Co-Authored-By: Claude Opus 4.5 --- Makefile.am | 1 + configure.ac | 2 + tracegrind/Makefile.am | 82 ++ tracegrind/bb.c | 344 ++++++ tracegrind/bbcc.c | 932 +++++++++++++++ tracegrind/callstack.c | 435 +++++++ tracegrind/clo.c | 700 +++++++++++ tracegrind/context.c | 330 ++++++ tracegrind/costs.c | 68 ++ tracegrind/costs.h | 55 + tracegrind/debug.c | 460 ++++++++ tracegrind/dump.c | 1763 ++++++++++++++++++++++++++++ tracegrind/events.c | 505 ++++++++ tracegrind/events.h | 131 +++ tracegrind/fn.c | 787 +++++++++++++ tracegrind/global.h | 868 ++++++++++++++ tracegrind/jumps.c | 235 ++++ tracegrind/main.c | 2154 ++++++++++++++++++++++++++++++++++ tracegrind/sim.c | 1739 +++++++++++++++++++++++++++ tracegrind/tests/Makefile.am | 3 + tracegrind/threads.c | 456 +++++++ tracegrind/tracegrind.h | 131 +++ 22 files changed, 12181 insertions(+) create mode 100644 tracegrind/Makefile.am create mode 100644 tracegrind/bb.c create mode 100644 tracegrind/bbcc.c create mode 100644 tracegrind/callstack.c create mode 100644 tracegrind/clo.c create mode 100644 tracegrind/context.c create mode 100644 tracegrind/costs.c create mode 100644 tracegrind/costs.h create mode 100644 tracegrind/debug.c create mode 100644 tracegrind/dump.c create mode 100644 tracegrind/events.c create mode 100644 tracegrind/events.h create mode 100644 tracegrind/fn.c create mode 100644 tracegrind/global.h create mode 100644 tracegrind/jumps.c create mode 100644 tracegrind/main.c create mode 100644 tracegrind/sim.c create mode 100644 tracegrind/tests/Makefile.am create mode 100644 tracegrind/threads.c create mode 100644 tracegrind/tracegrind.h diff --git a/Makefile.am b/Makefile.am index 6c5b9f5b6..2cfe16d16 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,6 +9,7 @@ TOOLS = \ memcheck \ cachegrind \ callgrind \ + tracegrind \ helgrind \ drd \ massif \ diff --git a/configure.ac b/configure.ac index f3f3867ef..fcc1afea4 100644 --- a/configure.ac +++ b/configure.ac @@ -5807,6 +5807,8 @@ AC_CONFIG_FILES([ callgrind/callgrind_annotate callgrind/callgrind_control callgrind/tests/Makefile + tracegrind/Makefile + tracegrind/tests/Makefile helgrind/Makefile helgrind/tests/Makefile drd/Makefile diff --git a/tracegrind/Makefile.am b/tracegrind/Makefile.am new file mode 100644 index 000000000..e23377779 --- /dev/null +++ b/tracegrind/Makefile.am @@ -0,0 +1,82 @@ +include $(top_srcdir)/Makefile.tool.am + +EXTRA_DIST = + +#---------------------------------------------------------------------------- +# Headers, etc +#---------------------------------------------------------------------------- + +pkginclude_HEADERS = tracegrind.h + +noinst_HEADERS = \ + costs.h \ + events.h \ + global.h + +#---------------------------------------------------------------------------- +# tracegrind- +#---------------------------------------------------------------------------- + +noinst_PROGRAMS = tracegrind-@VGCONF_ARCH_PRI@-@VGCONF_OS@ +if VGCONF_HAVE_PLATFORM_SEC +noinst_PROGRAMS += tracegrind-@VGCONF_ARCH_SEC@-@VGCONF_OS@ 
+endif + +TRACEGRIND_SOURCES_COMMON = \ + bb.c \ + bbcc.c \ + callstack.c \ + clo.c \ + context.c \ + costs.c \ + debug.c \ + dump.c \ + events.c \ + fn.c \ + jumps.c \ + main.c \ + sim.c \ + threads.c + +# We sneakily include "cg_branchpred.c" and "cg_arch.c" from cachegrind +TRACEGRIND_CFLAGS_COMMON = -I$(top_srcdir)/cachegrind + +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = \ + $(TRACEGRIND_SOURCES_COMMON) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS = \ + $(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \ + $(AM_CFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) $(TRACEGRIND_CFLAGS_COMMON) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_DEPENDENCIES = \ + $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDADD = \ + $(TOOL_LDADD_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS = \ + $(TOOL_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LINK = \ + $(top_builddir)/coregrind/link_tool_exe_@VGCONF_OS@ \ + @VALT_LOAD_ADDRESS_PRI@ \ + $(LINK) \ + $(tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS) \ + $(tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS) + +if VGCONF_HAVE_PLATFORM_SEC +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES = \ + $(TRACEGRIND_SOURCES_COMMON) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS = \ + $(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \ + $(AM_CFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) $(TRACEGRIND_CFLAGS_COMMON) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_DEPENDENCIES = \ + $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDADD = \ + $(TOOL_LDADD_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS = \ + $(TOOL_LDFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \ + $(top_builddir)/coregrind/link_tool_exe_@VGCONF_OS@ \ + @VALT_LOAD_ADDRESS_SEC@ \ + $(LINK) \ + $(tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \ + $(tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS) +endif diff --git a/tracegrind/bb.c b/tracegrind/bb.c new file mode 100644 index 000000000..32f5a6c7c --- /dev/null +++ b/tracegrind/bb.c @@ -0,0 +1,344 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- bb.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Basic block (BB) operations ---*/ +/*------------------------------------------------------------*/ + +/* BB hash, resizable */ +bb_hash bbs; + +void TG_(init_bb_hash)(void) +{ + Int i; + + bbs.size = 8437; + bbs.entries = 0; + bbs.table = (BB**) TG_MALLOC("cl.bb.ibh.1", + bbs.size * sizeof(BB*)); + + for (i = 0; i < bbs.size; i++) bbs.table[i] = NULL; +} + +bb_hash* TG_(get_bb_hash)(void) +{ + return &bbs; +} + +/* The hash stores BBs according to + * - ELF object (is 0 for code in anonymous mapping) + * - BB base as object file offset + */ +static __inline__ +UInt bb_hash_idx(obj_node* obj, PtrdiffT offset, UInt size) +{ + return (((Addr)obj) + offset) % size; +} + +/* double size of bb table */ +static +void resize_bb_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BB **new_table, *curr, *next; + UInt new_idx; + + new_size = 2* bbs.size +3; + new_table = (BB**) TG_MALLOC("cl.bb.rbt.1", + new_size * sizeof(BB*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < bbs.size; i++) { + if (bbs.table[i] == NULL) continue; + + curr = bbs.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = bb_hash_idx(curr->obj, curr->offset, new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(bbs.table); + + + TG_DEBUG(0, "Resize BB Hash: %u => %d (entries %u, conflicts %d/%d)\n", + bbs.size, new_size, + bbs.entries, conflicts1, conflicts2); + + bbs.size = new_size; + bbs.table = new_table; + TG_(stat).bb_hash_resizes++; +} + + +/** + * Allocate new BB structure (including space for event type list) + * Not initialized: + * - instr_len, cost_count, instr[] + */ +static BB* new_bb(obj_node* obj, PtrdiffT offset, + UInt instr_count, UInt cjmp_count, Bool cjmp_inverted) +{ + BB* bb; + UInt idx, size; + + /* check fill degree of bb hash table and resize if needed (>80%) */ + bbs.entries++; + if (10 * bbs.entries / bbs.size > 8) + resize_bb_table(); + + size = sizeof(BB) + instr_count * sizeof(InstrInfo) + + (cjmp_count+1) * sizeof(CJmpInfo); + bb = (BB*) TG_MALLOC("cl.bb.nb.1", size); + VG_(memset)(bb, 0, size); + + bb->obj = obj; + bb->offset = offset; + + bb->instr_count = instr_count; + bb->cjmp_count = cjmp_count; + bb->cjmp_inverted = cjmp_inverted; + bb->jmp = (CJmpInfo*) &(bb->instr[instr_count]); + bb->instr_len = 0; + bb->cost_count = 0; + bb->sect_kind = VG_(DebugInfo_sect_kind)(NULL, offset + obj->offset); + bb->fn = 0; + bb->line = 0; + bb->is_entry = 0; + bb->bbcc_list = 0; + bb->last_bbcc = 0; + + /* insert into BB hash table */ + idx = bb_hash_idx(obj, offset, bbs.size); + bb->next = bbs.table[idx]; + bbs.table[idx] = bb; + + TG_(stat).distinct_bbs++; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(3) { + VG_(printf)(" new_bb (instr %u, jmps %u, inv %s) [now %d]: ", + instr_count, cjmp_count, + cjmp_inverted ? 
"yes":"no", + TG_(stat).distinct_bbs); + TG_(print_bb)(0, bb); + VG_(printf)("\n"); + } +#endif + + TG_(get_fn_node)(bb); + + return bb; +} + + +/* get the BB structure for a BB start address */ +static __inline__ +BB* lookup_bb(obj_node* obj, PtrdiffT offset) +{ + BB* bb; + Int idx; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + while(bb) { + if ((bb->obj == obj) && (bb->offset == offset)) break; + bb = bb->next; + } + + TG_DEBUG(5, " lookup_bb (Obj %s, off %#lx): %p\n", + obj->name, (UWord)offset, bb); + return bb; +} + +static __inline__ +obj_node* obj_of_address(Addr addr) +{ + obj_node* obj; + DebugInfo* di; + PtrdiffT offset; + + DiEpoch ep = VG_(current_DiEpoch)(); + di = VG_(find_DebugInfo)(ep, addr); + obj = TG_(get_obj_node)( di ); + + /* Update symbol offset in object if remapped */ + /* FIXME (or at least check this) 2008 Feb 19: 'offset' is + only correct for text symbols, not for data symbols */ + offset = di ? VG_(DebugInfo_get_text_bias)(di):0; + if (obj->offset != offset) { + Addr start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + + TG_DEBUG(0, "Mapping changed for '%s': %#lx -> %#lx\n", + obj->name, obj->start, start); + + /* Size should be the same, and offset diff == start diff */ + TG_ASSERT( obj->size == (di ? VG_(DebugInfo_get_text_size)(di) : 0) ); + TG_ASSERT( obj->start - start == obj->offset - offset ); + obj->offset = offset; + obj->start = start; + } + + return obj; +} + +/* Get the BB structure for a BB start address. + * If the BB has to be created, the IRBB is needed to + * compute the event type list for costs, and seen_before is + * set to False. Otherwise, seen_before is set to True. + * + * BBs are never discarded. There are 2 cases where this function + * is called from TG_(instrument)() and a BB already exists: + * - The instrumented version was removed from Valgrinds TT cache + * - The ELF object of the BB was unmapped and mapped again. + * This involves a possibly different address, but is handled by + * looking up a BB keyed by (obj_node, file offset). + * + * bbIn==0 is possible for artificial BB without real code. + * Such a BB is created when returning to an unknown function. + */ +BB* TG_(get_bb)(Addr addr, IRSB* bbIn, /*OUT*/ Bool *seen_before) +{ + BB* bb; + obj_node* obj; + UInt n_instrs, n_jmps; + Bool cjmp_inverted = False; + + TG_DEBUG(5, "+ get_bb(BB %#lx)\n", addr); + + obj = obj_of_address(addr); + bb = lookup_bb(obj, addr - obj->offset); + + n_instrs = 0; + n_jmps = 0; + TG_(collectBlockInfo)(bbIn, &n_instrs, &n_jmps, &cjmp_inverted); + + *seen_before = bb ? True : False; + if (*seen_before) { + if (bb->instr_count != n_instrs) { + VG_(message)(Vg_DebugMsg, + "ERROR: BB Retranslation Mismatch at BB %#lx\n", addr); + VG_(message)(Vg_DebugMsg, + " new: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + obj->name, (UWord)obj->offset, + addr - obj->offset, n_instrs); + VG_(message)(Vg_DebugMsg, + " old: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + bb->obj->name, (UWord)bb->obj->offset, + (UWord)bb->offset, bb->instr_count); + TG_ASSERT(bb->instr_count == n_instrs ); + } + TG_ASSERT(bb->cjmp_count == n_jmps ); + TG_(stat).bb_retranslations++; + + TG_DEBUG(5, "- get_bb(BB %#lx): seen before.\n", addr); + return bb; + } + + bb = new_bb(obj, addr - obj->offset, n_instrs, n_jmps, cjmp_inverted); + + TG_DEBUG(5, "- get_bb(BB %#lx)\n", addr); + + return bb; +} + +/* Delete the BB info for the bb with unredirected entry-point + address 'addr'. 
*/ +void TG_(delete_bb)(Addr addr) +{ + BB *bb, *bp; + Int idx, size; + + obj_node* obj = obj_of_address(addr); + PtrdiffT offset = addr - obj->offset; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + /* bb points at the current bb under consideration, and bp is the + one before. */ + bp = NULL; + while(bb) { + if ((bb->obj == obj) && (bb->offset == offset)) break; + bp = bb; + bb = bb->next; + } + + if (bb == NULL) { + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): NOT FOUND\n", + obj->name, (UWord)offset); + + /* we didn't find it. + * this happens when tracegrinds instrumentation mode + * was off at BB translation time, ie. no BB was created. + */ + return; + } + + /* unlink it from hash table */ + + if (bp == NULL) { + /* we found the first one in the list. */ + tl_assert(bb == bbs.table[idx]); + bbs.table[idx] = bb->next; + } else { + tl_assert(bb != bbs.table[idx]); + bp->next = bb->next; + } + + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): %p, BBCC head: %p\n", + obj->name, (UWord)offset, bb, bb->bbcc_list); + + if (bb->bbcc_list == 0) { + /* can be safely deleted */ + + /* Fill the block up with junk and then free it, so we will + hopefully get a segfault if it is used again by mistake. */ + size = sizeof(BB) + + bb->instr_count * sizeof(InstrInfo) + + (bb->cjmp_count+1) * sizeof(CJmpInfo); + VG_(memset)( bb, 0xAA, size ); + TG_FREE(bb); + return; + } + TG_DEBUG(3, " delete_bb: BB in use, can not free!\n"); +} diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c new file mode 100644 index 000000000..af0210562 --- /dev/null +++ b/tracegrind/bbcc.c @@ -0,0 +1,932 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- bbcc.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" +#include "costs.h" + +#include "pub_tool_threadstate.h" + +/*------------------------------------------------------------*/ +/*--- BBCC operations ---*/ +/*------------------------------------------------------------*/ + +#define N_BBCC_INITIAL_ENTRIES 10437 + +/* BBCC table (key is BB/Context), per thread, resizable */ +bbcc_hash current_bbccs; + +void TG_(init_bbcc_hash)(bbcc_hash* bbccs) +{ + Int i; + + TG_ASSERT(bbccs != 0); + + bbccs->size = N_BBCC_INITIAL_ENTRIES; + bbccs->entries = 0; + bbccs->table = (BBCC**) TG_MALLOC("cl.bbcc.ibh.1", + bbccs->size * sizeof(BBCC*)); + + for (i = 0; i < bbccs->size; i++) bbccs->table[i] = NULL; +} + +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_bbccs.size; + dst->entries = current_bbccs.entries; + dst->table = current_bbccs.table; +} + +bbcc_hash* TG_(get_current_bbcc_hash)(void) +{ + return ¤t_bbccs; +} + +void TG_(set_current_bbcc_hash)(bbcc_hash* h) +{ + TG_ASSERT(h != 0); + + current_bbccs.size = h->size; + current_bbccs.entries = h->entries; + current_bbccs.table = h->table; +} + +/* + * Zero all costs of a BBCC + */ +void TG_(zero_bbcc)(BBCC* bbcc) +{ + Int i; + jCC* jcc; + + TG_ASSERT(bbcc->cxt != 0); + TG_DEBUG(1, " zero_bbcc: BB %#lx, Cxt %u " + "(fn '%s', rec %u)\n", + bb_addr(bbcc->bb), + bbcc->cxt->base_number + bbcc->rec_index, + bbcc->cxt->fn[0]->name, + bbcc->rec_index); + + if ((bbcc->ecounter_sum ==0) && + (bbcc->ret_counter ==0)) return; + + for(i=0;ibb->cost_count;i++) + bbcc->cost[i] = 0; + for(i=0;i <= bbcc->bb->cjmp_count;i++) { + bbcc->jmp[i].ecounter = 0; + for(jcc=bbcc->jmp[i].jcc_list; jcc; jcc=jcc->next_from) { + TG_(init_cost)( TG_(sets).full, jcc->cost ); + jcc->call_counter = 0; + } + } + bbcc->ecounter_sum = 0; + bbcc->ret_counter = 0; +} + + + +void TG_(forall_bbccs)(void (*func)(BBCC*)) +{ + BBCC *bbcc, *bbcc2; + int i, j; + + for (i = 0; i < current_bbccs.size; i++) { + if ((bbcc=current_bbccs.table[i]) == NULL) continue; + while (bbcc) { + /* every bbcc should have a rec_array */ + TG_ASSERT(bbcc->rec_array != 0); + + for(j=0;jcxt->fn[0]->separate_recursions;j++) { + if ((bbcc2 = bbcc->rec_array[j]) == 0) continue; + + (*func)(bbcc2); + } + bbcc = bbcc->next; + } + } +} + + +/* All BBCCs for recursion level 0 are inserted into a + * thread specific hash table with key + * - address of BB structure (unique, as never freed) + * - current context (includes caller chain) + * BBCCs for other recursion levels are in bbcc->rec_array. + * + * The hash is used in setup_bb(), i.e. to find the cost + * counters to be changed in the execution of a BB. + */ + +static __inline__ +UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size) +{ + TG_ASSERT(bb != 0); + TG_ASSERT(cxt != 0); + + return ((Addr)bb + (Addr)cxt) % size; +} + + +/* Lookup for a BBCC in hash. + */ +static +BBCC* lookup_bbcc(BB* bb, Context* cxt) +{ + BBCC* bbcc = bb->last_bbcc; + UInt idx; + + /* check LRU */ + if (bbcc->cxt == cxt) { + if (!TG_(clo).separate_threads) { + /* if we don't dump threads separate, tid doesn't have to match */ + return bbcc; + } + if (bbcc->tid == TG_(current_tid)) return bbcc; + } + + TG_(stat).bbcc_lru_misses++; + + idx = bbcc_hash_idx(bb, cxt, current_bbccs.size); + bbcc = current_bbccs.table[idx]; + while (bbcc && + (bb != bbcc->bb || + cxt != bbcc->cxt)) { + bbcc = bbcc->next; + } + + TG_DEBUG(2," lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n", + bb_addr(bb), cxt->base_number, cxt->fn[0]->name, + bbcc, bbcc ? 
bbcc->tid : 0); + + TG_DEBUGIF(2) + if (bbcc) TG_(print_bbcc)(-2,bbcc); + + return bbcc; +} + + +/* double size of hash table 1 (addr->BBCC) */ +static void resize_bbcc_hash(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BBCC** new_table; + UInt new_idx; + BBCC *curr_BBCC, *next_BBCC; + + new_size = 2*current_bbccs.size+3; + new_table = (BBCC**) TG_MALLOC("cl.bbcc.rbh.1", + new_size * sizeof(BBCC*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < current_bbccs.size; i++) { + if (current_bbccs.table[i] == NULL) continue; + + curr_BBCC = current_bbccs.table[i]; + while (NULL != curr_BBCC) { + next_BBCC = curr_BBCC->next; + + new_idx = bbcc_hash_idx(curr_BBCC->bb, + curr_BBCC->cxt, + new_size); + + curr_BBCC->next = new_table[new_idx]; + new_table[new_idx] = curr_BBCC; + if (curr_BBCC->next) { + conflicts1++; + if (curr_BBCC->next->next) + conflicts2++; + } + + curr_BBCC = next_BBCC; + } + } + + VG_(free)(current_bbccs.table); + + + TG_DEBUG(0,"Resize BBCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_bbccs.size, new_size, + current_bbccs.entries, conflicts1, conflicts2); + + current_bbccs.size = new_size; + current_bbccs.table = new_table; + TG_(stat).bbcc_hash_resizes++; +} + + +static __inline +BBCC** new_recursion(int size) +{ + BBCC** bbccs; + int i; + + bbccs = (BBCC**) TG_MALLOC("cl.bbcc.nr.1", sizeof(BBCC*) * size); + for(i=0;icjmp_count+1) * sizeof(JmpData)); + bbcc->bb = bb; + bbcc->tid = TG_(current_tid); + + bbcc->ret_counter = 0; + bbcc->skipped = 0; + bbcc->cost = TG_(get_costarray)(bb->cost_count); + for(i=0;icost_count;i++) + bbcc->cost[i] = 0; + for(i=0; i<=bb->cjmp_count; i++) { + bbcc->jmp[i].ecounter = 0; + bbcc->jmp[i].jcc_list = 0; + } + bbcc->ecounter_sum = 0; + + /* Init pointer caches (LRU) */ + bbcc->lru_next_bbcc = 0; + bbcc->lru_from_jcc = 0; + bbcc->lru_to_jcc = 0; + + TG_(stat).distinct_bbccs++; + + TG_DEBUG(3, " new_bbcc(BB %#lx): %p (now %d)\n", + bb_addr(bb), bbcc, TG_(stat).distinct_bbccs); + + return bbcc; +} + + +/** + * Inserts a new BBCC into hashes. + * BBCC specific items must be set as this is used for the hash + * keys: + * fn : current function + * tid : current thread ID + * from : position where current function is called from + * + * Recursion level doesn't need to be set as this is not included + * in the hash key: Only BBCCs with rec level 0 are in hashes. + */ +static +void insert_bbcc_into_hash(BBCC* bbcc) +{ + UInt idx; + + TG_ASSERT(bbcc->cxt != 0); + + TG_DEBUG(3,"+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n", + bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name); + + /* check fill degree of hash and resize if needed (>90%) */ + current_bbccs.entries++; + if (100 * current_bbccs.entries / current_bbccs.size > 90) + resize_bbcc_hash(); + + idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size); + bbcc->next = current_bbccs.table[idx]; + current_bbccs.table[idx] = bbcc; + + TG_DEBUG(3,"- insert_bbcc_into_hash: %u entries\n", + current_bbccs.entries); +} + +/* String is returned in a dynamically allocated buffer. Caller is + responsible for free'ing it. */ +static HChar* mangled_cxt(const Context* cxt, Int rec_index) +{ + Int i, p; + + if (!cxt) return VG_(strdup)("cl.bbcc.mcxt", "(no context)"); + + /* Overestimate the number of bytes we need to hold the string. 
*/ + SizeT need = 20; // rec_index + nul-terminator + for (i = 0; i < cxt->size; ++i) + need += VG_(strlen)(cxt->fn[i]->name) + 1; // 1 for leading ' + + HChar *mangled = TG_MALLOC("cl.bbcc.mcxt", need); + p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name); + if (rec_index >0) + p += VG_(sprintf)(mangled+p, "'%d", rec_index +1); + for(i=1;isize;i++) + p += VG_(sprintf)(mangled+p, "'%s", cxt->fn[i]->name); + + return mangled; +} + + +/* Create a new BBCC as a copy of an existing one, + * but with costs set to 0 and jcc chains empty. + * + * This is needed when a BB is executed in another context than + * the one at instrumentation time of the BB. + * + * Use cases: + * rec_index == 0: clone from a BBCC with differing tid/cxt + * and insert into hashes + * rec_index >0 : clone from a BBCC with same tid/cxt and rec_index 0 + * don't insert into hashes + */ +static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index) +{ + BBCC* bbcc; + + TG_DEBUG(3,"+ clone_bbcc(BB %#lx, rec %d, fn %s)\n", + bb_addr(orig->bb), rec_index, cxt->fn[0]->name); + + bbcc = new_bbcc(orig->bb); + + if (rec_index == 0) { + + /* hash insertion is only allowed if tid or cxt is different */ + TG_ASSERT((orig->tid != TG_(current_tid)) || + (orig->cxt != cxt)); + + bbcc->rec_index = 0; + bbcc->cxt = cxt; + bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } + else { + if (TG_(clo).separate_threads) + TG_ASSERT(orig->tid == TG_(current_tid)); + + TG_ASSERT(orig->cxt == cxt); + TG_ASSERT(orig->rec_array); + TG_ASSERT(cxt->fn[0]->separate_recursions > rec_index); + TG_ASSERT(orig->rec_array[rec_index] ==0); + + /* new BBCC will only have differing recursion level */ + bbcc->rec_index = rec_index; + bbcc->cxt = cxt; + bbcc->rec_array = orig->rec_array; + bbcc->rec_array[rec_index] = bbcc; + } + + /* update list of BBCCs for same BB */ + bbcc->next_bbcc = orig->bb->bbcc_list; + orig->bb->bbcc_list = bbcc; + + + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); + + HChar *mangled_orig = mangled_cxt(orig->cxt, orig->rec_index); + HChar *mangled_bbcc = mangled_cxt(bbcc->cxt, bbcc->rec_index); + TG_DEBUG(2,"- clone_BBCC(%p, %d) for BB %#lx\n" + " orig %s\n" + " new %s\n", + orig, rec_index, bb_addr(orig->bb), + mangled_orig, + mangled_bbcc); + TG_FREE(mangled_orig); + TG_FREE(mangled_bbcc); + + TG_(stat).bbcc_clones++; + + return bbcc; +}; + + + +/* Get a pointer to the cost centre structure for given basic block + * address. If created, the BBCC is inserted into the BBCC hash. + * Also sets BB_seen_before by reference. + * + */ +BBCC* TG_(get_bbcc)(BB* bb) +{ + BBCC* bbcc; + + TG_DEBUG(3, "+ get_bbcc(BB %#lx)\n", bb_addr(bb)); + + bbcc = bb->bbcc_list; + + if (!bbcc) { + bbcc = new_bbcc(bb); + + /* initialize BBCC */ + bbcc->cxt = 0; + bbcc->rec_array = 0; + bbcc->rec_index = 0; + + bbcc->next_bbcc = bb->bbcc_list; + bb->bbcc_list = bbcc; + bb->last_bbcc = bbcc; + + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); + } + + TG_DEBUG(3, "- get_bbcc(BB %#lx): BBCC %p\n", + bb_addr(bb), bbcc); + + return bbcc; +} + + +/* Tracegrind manages its own call stack for each thread. + * When leaving a function, a underflow can happen when + * Tracegrind's tracing was switched on in the middle of + * a run, i.e. when Tracegrind was not able to trace the + * call instruction. + * This function tries to reconstruct the original call. 
+ * As we know the return address (the address following + * the CALL instruction), we can detect the function + * we return back to, but the original call site is unknown. + * We suppose a call site at return address - 1. + * (TODO: other heuristic: lookup info of instrumented BBs). + */ +static void handleUnderflow(BB* bb) +{ + /* RET at top of call stack */ + BBCC* source_bbcc; + BB* source_bb; + Bool seen_before; + fn_node* caller; + int fn_number; + unsigned *pactive; + call_entry* call_entry_up; + + TG_DEBUG(1," Callstack underflow !\n"); + + /* we emulate an old call from the function we return to + * by using ( -1) */ + source_bb = TG_(get_bb)(bb_addr(bb)-1, 0, &seen_before); + source_bbcc = TG_(get_bbcc)(source_bb); + + /* seen_before can be true if RET from a signal handler */ + if (!seen_before) { + source_bbcc->ecounter_sum = TG_(current_state).collect ? 1 : 0; + } + else if (TG_(current_state).collect) + source_bbcc->ecounter_sum++; + + /* Force a new top context, will be set active by push_cxt() */ + TG_(current_fn_stack).top--; + TG_(current_state).cxt = 0; + caller = TG_(get_fn_node)(bb); + TG_(push_cxt)( caller ); + + if (!seen_before) { + /* set rec array for source BBCC: this is at rec level 1 */ + source_bbcc->rec_array = new_recursion(caller->separate_recursions); + source_bbcc->rec_array[0] = source_bbcc; + + TG_ASSERT(source_bbcc->cxt == 0); + source_bbcc->cxt = TG_(current_state).cxt; + insert_bbcc_into_hash(source_bbcc); + } + TG_ASSERT(TG_(current_state).bbcc); + + /* correct active counts */ + fn_number = TG_(current_state).bbcc->cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + + /* This assertion is not correct for reentrant + * signal handlers */ + /* TG_ASSERT(*pactive == 0); */ + + TG_(current_state).nonskipped = 0; /* we didn't skip this function */ + /* back to current context */ + TG_(push_cxt)( TG_(current_state).bbcc->cxt->fn[0] ); + TG_(push_call_stack)(source_bbcc, 0, TG_(current_state).bbcc, + (Addr)-1, False); + call_entry_up = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp -1]); + /* assume this call is lasting since last dump or + * for a signal handler since it's call */ + if (TG_(current_state).sig == 0) + TG_(copy_cost)( TG_(sets).full, call_entry_up->enter_cost, + TG_(get_current_thread)()->lastdump_cost ); + else + TG_(zero_cost)( TG_(sets).full, call_entry_up->enter_cost ); +} + + +/* + * Helper function called at start of each instrumented BB to setup + * pointer to costs for current thread/context/recursion level + */ + +VG_REGPARM(1) +void TG_(setup_bbcc)(BB* bb) +{ + BBCC *bbcc, *last_bbcc; + Bool call_emulation = False, delayed_push = False, skip = False; + Addr sp; + BB* last_bb; + ThreadId tid; + TgJumpKind jmpkind; + Bool isConditionalJump; + Int passed = 0, csp; + Bool ret_without_call = False; + Int popcount_on_return = 1; + + TG_DEBUG(3,"+ setup_bbcc(BB %#lx)\n", bb_addr(bb)); + + /* This is needed because thread switches can not reliable be tracked + * with callback TG_(run_thread) only: we have otherwise no way to get + * the thread ID after a signal handler returns. + * This could be removed again if that bug is fixed in Valgrind. + * This is in the hot path but hopefully not to costly. + */ + tid = VG_(get_running_tid)(); +#if 1 + /* TG_(switch_thread) is a no-op when tid is equal to TG_(current_tid). + * As this is on the hot path, we only call TG_(switch_thread)(tid) + * if tid differs from the TG_(current_tid). 
+ */ + if (UNLIKELY(tid != TG_(current_tid))) + TG_(switch_thread)(tid); +#else + TG_ASSERT(VG_(get_running_tid)() == TG_(current_tid)); +#endif + + sp = VG_(get_SP)(tid); + last_bbcc = TG_(current_state).bbcc; + last_bb = last_bbcc ? last_bbcc->bb : 0; + + if (last_bb) { + passed = TG_(current_state).jmps_passed; + TG_ASSERT(passed <= last_bb->cjmp_count); + jmpkind = last_bb->jmp[passed].jmpkind; + isConditionalJump = (passed < last_bb->cjmp_count); + + if (TG_(current_state).collect) { + if (!TG_(current_state).nonskipped) { + last_bbcc->ecounter_sum++; + last_bbcc->jmp[passed].ecounter++; + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr+1; + TG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count; + } + } + else { + /* do not increment exe counter of BBs in skipped functions, as it + * would fool dumping code */ + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr+1; + TG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count; + TG_(current_state).nonskipped->skipped[ fullOffset(EG_IR) ] + += instr_count; + } + } + } + + TG_DEBUGIF(4) { + TG_(print_execstate)(-2, &TG_(current_state) ); + TG_(print_bbcc_cost)(-2, last_bbcc); + } + } + else { + jmpkind = jk_None; + isConditionalJump = False; + } + + /* Manipulate JmpKind if needed, only using BB specific info */ + + csp = TG_(current_call_stack).sp; + + /* A return not matching the top call in our callstack is a jump */ + if ( (jmpkind == jk_Return) && (csp >0)) { + Int csp_up = csp-1; + call_entry* top_ce = &(TG_(current_call_stack).entry[csp_up]); + + /* We have a real return if + * - the stack pointer (SP) left the current stack frame, or + * - SP has the same value as when reaching the current function + * and the address of this BB is the return address of last call + * (we even allow to leave multiple frames if the SP stays the + * same and we find a matching return address) + * The latter condition is needed because on PPC, SP can stay + * the same over CALL=b(c)l / RET=b(c)lr boundaries + */ + if (sp < top_ce->sp) popcount_on_return = 0; + else if (top_ce->sp == sp) { + while(1) { + if (top_ce->ret_addr == bb_addr(bb)) break; + if (csp_up>0) { + csp_up--; + top_ce = &(TG_(current_call_stack).entry[csp_up]); + if (top_ce->sp == sp) { + popcount_on_return++; + continue; + } + } + popcount_on_return = 0; + break; + } + } + if (popcount_on_return == 0) { + jmpkind = jk_Jump; + ret_without_call = True; + } + } + + /* Should this jump be converted to call or pop/call ? */ + if (( jmpkind != jk_Return) && + ( jmpkind != jk_Call) && last_bb) { + + /* We simulate a JMP/Cont to be a CALL if + * - jump is in another ELF object or section kind + * - jump is to first instruction of a function (tail recursion) + */ + if (ret_without_call || + /* This is for detection of optimized tail recursion. + * On PPC, this is only detected as call when going to another + * function. The problem is that on PPC it can go wrong + * more easily (no stack frame setup needed) + */ +#if defined(VGA_ppc32) + (bb->is_entry && (last_bb->fn != bb->fn)) || +#else + bb->is_entry || +#endif + (last_bb->sect_kind != bb->sect_kind) || + (last_bb->obj->number != bb->obj->number)) { + + TG_DEBUG(1," JMP: %s[%s] to %s[%s]%s!\n", + last_bb->fn->name, last_bb->obj->name, + bb->fn->name, bb->obj->name, + ret_without_call?" 
(RET w/o CALL)":""); + + if (TG_(get_fn_node)(last_bb)->pop_on_jump && (csp>0)) { + + call_entry* top_ce = &(TG_(current_call_stack).entry[csp-1]); + + if (top_ce->jcc) { + + TG_DEBUG(1," Pop on Jump!\n"); + + /* change source for delayed push */ + TG_(current_state).bbcc = top_ce->jcc->from; + sp = top_ce->sp; + passed = top_ce->jcc->jmp; + TG_(pop_call_stack)(); + } + else { + TG_ASSERT(TG_(current_state).nonskipped != 0); + } + } + + jmpkind = jk_Call; + call_emulation = True; + } + } + + if (jmpkind == jk_Call) { + fn_node* node = TG_(get_fn_node)(bb); + skip = node->skip; + if (!skip && !node->obj_skip_checked){ + HChar* obj_name = node->file->obj->name; + // VG_(printf)(" %s\n", obj_name); + for (int i=0; iskip = True; + skip = True; + break; + } + } + node->obj_skip_checked = True; + } + } + + TG_DEBUGIF(1) { + if (isConditionalJump) + VG_(printf)("Cond-"); + switch(jmpkind) { + case jk_None: VG_(printf)("Fall-through"); break; + case jk_Jump: VG_(printf)("Jump"); break; + case jk_Call: VG_(printf)("Call"); break; + case jk_Return: VG_(printf)("Return"); break; + default: tl_assert(0); + } + VG_(printf)(" %08lx -> %08lx, SP %08lx\n", + last_bb ? bb_jmpaddr(last_bb) : 0, + bb_addr(bb), sp); + } + + /* Handle CALL/RET and update context to get correct BBCC */ + + if (jmpkind == jk_Return) { + + if ((csp == 0) || + ((TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) && + ( *(TG_(current_fn_stack).top-1)==0)) ) { + + /* On an empty call stack or at a signal separation marker, + * a RETURN generates an call stack underflow. + */ + handleUnderflow(bb); + TG_(pop_call_stack)(); + } + else { + TG_ASSERT(popcount_on_return >0); + TG_(unwind_call_stack)(sp, popcount_on_return); + } + } + else { + Int unwind_count = TG_(unwind_call_stack)(sp, 0); + if (unwind_count > 0) { + /* if unwinding was done, this actually is a return */ + jmpkind = jk_Return; + } + + if (jmpkind == jk_Call) { + delayed_push = True; + + csp = TG_(current_call_stack).sp; + if (call_emulation && csp>0) + sp = TG_(current_call_stack).entry[csp-1].sp; + + } + } + + /* Change new context if needed, taking delayed_push into account */ + if ((delayed_push && !skip) || (TG_(current_state).cxt == 0)) { + TG_(push_cxt)(TG_(get_fn_node)(bb)); + } + TG_ASSERT(TG_(current_fn_stack).top > TG_(current_fn_stack).bottom); + + /* If there is a fresh instrumented BBCC, assign current context */ + bbcc = TG_(get_bbcc)(bb); + if (bbcc->cxt == 0) { + TG_ASSERT(bbcc->rec_array == 0); + + bbcc->cxt = TG_(current_state).cxt; + bbcc->rec_array = + new_recursion((*TG_(current_fn_stack).top)->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } + else { + /* get BBCC with current context */ + + /* first check LRU of last bbcc executed */ + + if (last_bbcc) { + bbcc = last_bbcc->lru_next_bbcc; + if (bbcc && + ((bbcc->bb != bb) || + (bbcc->cxt != TG_(current_state).cxt))) + bbcc = 0; + } + else + bbcc = 0; + + if (!bbcc) + bbcc = lookup_bbcc(bb, TG_(current_state).cxt); + if (!bbcc) + bbcc = clone_bbcc(bb->bbcc_list, TG_(current_state).cxt, 0); + + bb->last_bbcc = bbcc; + } + + /* save for fast lookup */ + if (last_bbcc) + last_bbcc->lru_next_bbcc = bbcc; + + if ((*TG_(current_fn_stack).top)->separate_recursions >1) { + UInt level, idx; + fn_node* top = *(TG_(current_fn_stack).top); + + level = *TG_(get_fn_entry)(top->number); + + if (delayed_push && !skip) { + if (TG_(clo).skip_direct_recursion) { + /* a call was detected, which means that the source BB != 0 */ + TG_ASSERT(TG_(current_state).bbcc != 0); + 
/* only increment rec. level if called from different function */ + if (TG_(current_state).bbcc->cxt->fn[0] != bbcc->cxt->fn[0]) + level++; + } + else level++; + } + if (level> top->separate_recursions) + level = top->separate_recursions; + + if (level == 0) { + /* can only happen if instrumentation just was switched on */ + level = 1; + *TG_(get_fn_entry)(top->number) = 1; + } + + idx = level -1; + if (bbcc->rec_array[idx]) + bbcc = bbcc->rec_array[idx]; + else + bbcc = clone_bbcc(bbcc, TG_(current_state).cxt, idx); + + TG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc); + } + + if (delayed_push) { + if (!skip && TG_(current_state).nonskipped) { + /* a call from skipped to nonskipped */ + TG_(current_state).bbcc = TG_(current_state).nonskipped; + /* FIXME: take the real passed count from shadow stack */ + passed = TG_(current_state).bbcc->bb->cjmp_count; + } + TG_(push_call_stack)(TG_(current_state).bbcc, passed, + bbcc, sp, skip); + } + + if (TG_(clo).collect_jumps && (jmpkind == jk_Jump)) { + + /* Handle conditional jumps followed, i.e. trace arcs + * This uses JCC structures, too */ + + jCC* jcc = TG_(get_jcc)(last_bbcc, passed, bbcc); + TG_ASSERT(jcc != 0); + // Change from default, and check if already changed + if (jcc->jmpkind == jk_Call) + jcc->jmpkind = isConditionalJump ? jk_CondJump : jk_Jump; + else { + // FIXME: Why can this fail? + // TG_ASSERT(jcc->jmpkind == jmpkind); + } + + jcc->call_counter++; + if (isConditionalJump) + TG_(stat).jcnd_counter++; + else + TG_(stat).jump_counter++; + } + + TG_(current_state).bbcc = bbcc; + /* Even though this will be set in instrumented code directly before + * side exits, it needs to be set to 0 here in case an exception + * happens in first instructions of the BB */ + TG_(current_state).jmps_passed = 0; + // needed for log_* handlers called in this BB + TG_(bb_base) = bb->obj->offset + bb->offset; + TG_(cost_base) = bbcc->cost; + + TG_DEBUGIF(1) { + VG_(printf)(" "); + TG_(print_bbcc_fn)(bbcc); + VG_(printf)("\n"); + } + + TG_DEBUG(3,"- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n", + bb_addr(bb), bbcc->cost, bb->cost_count, + bb->instr_count, bb->instr_len); + TG_DEBUGIF(3) + TG_(print_cxt)(-8, TG_(current_state).cxt, bbcc->rec_index); + TG_DEBUG(3,"\n"); + + TG_(stat).bb_executions++; +} diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c new file mode 100644 index 000000000..d80669174 --- /dev/null +++ b/tracegrind/callstack.c @@ -0,0 +1,435 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_callstack.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Call stack, operations ---*/ +/*------------------------------------------------------------*/ + +/* Stack of current thread. Gets initialized when switching to 1st thread. + * + * The artificial call stack is an array of call_entry's, representing + * stack frames of the executing program. + * Array call_stack and call_stack_esp have same size and grow on demand. + * Array call_stack_esp holds SPs of corresponding stack frames. + * + */ + +#define N_CALL_STACK_INITIAL_ENTRIES 500 + +call_stack TG_(current_call_stack); + +void TG_(init_call_stack)(call_stack* s) +{ + Int i; + + TG_ASSERT(s != 0); + + s->size = N_CALL_STACK_INITIAL_ENTRIES; + s->entry = (call_entry*) TG_MALLOC("cl.callstack.ics.1", + s->size * sizeof(call_entry)); + s->sp = 0; + s->entry[0].cxt = 0; /* for assertion in push_cxt() */ + + for(i=0; isize; i++) s->entry[i].enter_cost = 0; +} + +call_entry* TG_(get_call_entry)(Int sp) +{ + TG_ASSERT(sp <= TG_(current_call_stack).sp); + return &(TG_(current_call_stack).entry[sp]); +} + +void TG_(copy_current_call_stack)(call_stack* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = TG_(current_call_stack).size; + dst->entry = TG_(current_call_stack).entry; + dst->sp = TG_(current_call_stack).sp; +} + +void TG_(set_current_call_stack)(call_stack* s) +{ + TG_ASSERT(s != 0); + + TG_(current_call_stack).size = s->size; + TG_(current_call_stack).entry = s->entry; + TG_(current_call_stack).sp = s->sp; +} + + +static __inline__ +void ensure_stack_size(Int i) +{ + Int oldsize; + call_stack *cs = &TG_(current_call_stack); + + if (i < cs->size) return; + + oldsize = cs->size; + cs->size *= 2; + while (i > cs->size) cs->size *= 2; + + cs->entry = (call_entry*) VG_(realloc)("cl.callstack.ess.1", + cs->entry, + cs->size * sizeof(call_entry)); + + for(i=oldsize; isize; i++) + cs->entry[i].enter_cost = 0; + + TG_(stat).call_stack_resizes++; + + TG_DEBUGIF(2) + VG_(printf)(" call stack enlarged to %u entries\n", + TG_(current_call_stack).size); +} + + + +/* Called when function entered nonrecursive */ +static void function_entered(fn_node* fn) +{ + TG_ASSERT(fn != 0); + +#if TG_ENABLE_DEBUG + if (fn->verbosity >=0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, + "Entering %s: Verbosity set to %d\n", + fn->name, TG_(clo).verbose); + } +#endif + + if (fn->dump_before) { + HChar trigger[VG_(strlen)(fn->name) + 20]; + VG_(sprintf)(trigger, "--dump-before=%s", fn->name); + TG_(dump_profile)(trigger, True); + } + else if (fn->zero_before) { + TG_(zero_all_cost)(True); + } + + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2," entering %s: toggled collection state to %s\n", + fn->name, + TG_(current_state).collect ? "ON" : "OFF"); + } +} + +/* Called when function left (no recursive level active) */ +static void function_left(fn_node* fn) +{ + TG_ASSERT(fn != 0); + + if (fn->dump_after) { + HChar trigger[VG_(strlen)(fn->name) + 20]; + VG_(sprintf)(trigger, "--dump-after=%s", fn->name); + TG_(dump_profile)(trigger, True); + } + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2," leaving %s: toggled collection state to %s\n", + fn->name, + TG_(current_state).collect ? 
"ON" : "OFF"); + } + +#if TG_ENABLE_DEBUG + if (fn->verbosity >=0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, + "Leaving %s: Verbosity set back to %d\n", + fn->name, TG_(clo).verbose); + } +#endif +} + + +/* Push call on call stack. + * + * Increment the usage count for the function called. + * A jump from to , with . + * If is true, this is a call to a function to be skipped; + * for this, we set jcc = 0. + */ +void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) +{ + jCC* jcc; + UInt* pdepth; + call_entry* current_entry; + Addr ret_addr; + + /* Ensure a call stack of size +1. + * The +1 is needed as push_cxt will store the + * context at [current_sp] + */ + ensure_stack_size(TG_(current_call_stack).sp +1); + current_entry = &(TG_(current_call_stack).entry[TG_(current_call_stack).sp]); + + if (skip) { + jcc = 0; + } + else { + fn_node* to_fn = to->cxt->fn[0]; + + if (TG_(current_state).nonskipped) { + /* this is a jmp from skipped to nonskipped */ + TG_ASSERT(TG_(current_state).nonskipped == from); + } + + /* As push_cxt() has to be called before push_call_stack if not + * skipping, the old context should already be saved on the stack */ + TG_ASSERT(current_entry->cxt != 0); + TG_(copy_cost_lz)( TG_(sets).full, &(current_entry->enter_cost), + TG_(current_state).cost ); + + jcc = TG_(get_jcc)(from, jmp, to); + TG_ASSERT(jcc != 0); + + pdepth = TG_(get_fn_entry)(to_fn->number); + if (TG_(clo).skip_direct_recursion) { + /* only increment depth if another function is called */ + if (jcc->from->cxt->fn[0] != to_fn) (*pdepth)++; + } + else (*pdepth)++; + + if (*pdepth>1) + TG_(stat).rec_call_counter++; + + jcc->call_counter++; + TG_(stat).call_counter++; + + if (*pdepth == 1) function_entered(to_fn); + } + + /* return address is only is useful with a real call; + * used to detect RET w/o CALL */ + if (from->bb->jmp[jmp].jmpkind == jk_Call) { + UInt instr = from->bb->jmp[jmp].instr; + ret_addr = bb_addr(from->bb) + + from->bb->instr[instr].instr_offset + + from->bb->instr[instr].instr_size; + } + else + ret_addr = 0; + + /* put jcc on call stack */ + current_entry->jcc = jcc; + current_entry->sp = sp; + current_entry->ret_addr = ret_addr; + current_entry->nonskipped = TG_(current_state).nonskipped; + + TG_(current_call_stack).sp++; + + /* To allow for above assertion we set context of next frame to 0 */ + TG_ASSERT(TG_(current_call_stack).sp < TG_(current_call_stack).size); + current_entry++; + current_entry->cxt = 0; + + if (!skip) + TG_(current_state).nonskipped = 0; + else if (!TG_(current_state).nonskipped) { + /* a call from nonskipped to skipped */ + TG_(current_state).nonskipped = from; + if (!TG_(current_state).nonskipped->skipped) { + TG_(init_cost_lz)( TG_(sets).full, + &TG_(current_state).nonskipped->skipped); + TG_(stat).distinct_skips++; + } + } + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(0) { + if (TG_(clo).verbose<2) { + if (jcc && jcc->to && jcc->to->bb) { + const HChar spaces[][41] = { + " . . . . . . . . . .", + " . . . . . . . . . . ", + " . . . . . . . . . . ", + ". . . . . . . . . . " }; + + int s = TG_(current_call_stack).sp; + UInt* pars = (UInt*) sp; + + BB* bb = jcc->to->bb; + if (s>40) s=40; + VG_(printf)("%s> %s(0x%x, 0x%x, ...) [%s / %#lx]\n", spaces[s%4]+40-s, bb->fn->name, + pars ? pars[1]:0, + pars ? 
pars[2]:0, + bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset); + } + } + else if (TG_(clo).verbose<4) { + VG_(printf)("+ %2d ", TG_(current_call_stack).sp); + TG_(print_short_jcc)(jcc); + VG_(printf)(", SP %#lx, RA %#lx\n", sp, ret_addr); + } + else { + VG_(printf)(" Pushed "); + TG_(print_stackentry)(3, TG_(current_call_stack).sp-1); + } + } +#endif + +} + + +/* Pop call stack and update inclusive sums. + * Returns modified fcc. + * + * If the JCC becomes inactive, call entries are freed if possible + */ +void TG_(pop_call_stack)(void) +{ + jCC* jcc; + Int depth = 0; + call_entry* lower_entry; + + if (TG_(current_state).sig >0) { + /* Check if we leave a signal handler; this can happen when + * calling longjmp() in the handler */ + TG_(run_post_signal_on_call_stack_bottom)(); + } + + lower_entry = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp-1]); + + TG_DEBUG(4,"+ pop_call_stack: frame %d, jcc %p\n", + TG_(current_call_stack).sp, lower_entry->jcc); + + /* jCC item not any more on real stack: pop */ + jcc = lower_entry->jcc; + TG_(current_state).nonskipped = lower_entry->nonskipped; + + if (jcc) { + fn_node* to_fn = jcc->to->cxt->fn[0]; + UInt* pdepth = TG_(get_fn_entry)(to_fn->number); + if (TG_(clo).skip_direct_recursion) { + /* only decrement depth if another function was called */ + if (jcc->from->cxt->fn[0] != to_fn) (*pdepth)--; + } + else (*pdepth)--; + depth = *pdepth; + + /* add cost difference to sum */ + if ( TG_(add_diff_cost_lz)( TG_(sets).full, &(jcc->cost), + lower_entry->enter_cost, + TG_(current_state).cost) ) { + + /* only count this call if it attributed some cost. + * the ret_counter is used to check if a BBCC dump is needed. + */ + jcc->from->ret_counter++; + } + TG_(stat).ret_counter++; + + /* restore context */ + TG_(current_state).cxt = lower_entry->cxt; + TG_(current_fn_stack).top = + TG_(current_fn_stack).bottom + lower_entry->fn_sp; + TG_ASSERT(TG_(current_state).cxt != 0); + + if (depth == 0) function_left(to_fn); + } + + /* To allow for an assertion in push_call_stack() */ + lower_entry->cxt = 0; + + TG_(current_call_stack).sp--; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(1) { + if (TG_(clo).verbose<4) { + if (jcc) { + /* popped JCC target first */ + VG_(printf)("- %2d %#lx => ", + TG_(current_call_stack).sp, + bb_addr(jcc->to->bb)); + TG_(print_addr)(bb_jmpaddr(jcc->from->bb)); + VG_(printf)(", SP %#lx\n", + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + TG_(print_cost)(10, TG_(sets).full, jcc->cost); + } + else + VG_(printf)("- %2d [Skipped JCC], SP %#lx\n", + TG_(current_call_stack).sp, + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + } + else { + VG_(printf)(" Popped "); + TG_(print_stackentry)(7, TG_(current_call_stack).sp); + if (jcc) { + VG_(printf)(" returned to "); + TG_(print_addr_ln)(bb_jmpaddr(jcc->from->bb)); + } + } + } +#endif + +} + + +/* Unwind enough CallStack items to sync with current stack pointer. + * Returns the number of stack frames unwinded. + */ +Int TG_(unwind_call_stack)(Addr sp, Int minpops) +{ + Int csp; + Int unwind_count = 0; + TG_DEBUG(4,"+ unwind_call_stack(sp %#lx, minpops %d): frame %d\n", + sp, minpops, TG_(current_call_stack).sp); + + /* We pop old stack frames. + * For a call, be p the stack address with return address. 
+ * - call_stack_esp[] has SP after the CALL: p-4 + * - current sp is after a RET: >= p + */ + + while( (csp=TG_(current_call_stack).sp) >0) { + call_entry* top_ce = &(TG_(current_call_stack).entry[csp-1]); + + if ((top_ce->sp < sp) || + ((top_ce->sp == sp) && minpops>0)) { + + minpops--; + unwind_count++; + TG_(pop_call_stack)(); + csp=TG_(current_call_stack).sp; + continue; + } + break; + } + + TG_DEBUG(4,"- unwind_call_stack\n"); + return unwind_count; +} diff --git a/tracegrind/clo.c b/tracegrind/clo.c new file mode 100644 index 000000000..cd3a05012 --- /dev/null +++ b/tracegrind/clo.c @@ -0,0 +1,700 @@ +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains lot of code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" // for VG_PREFIX + +#include "global.h" + + + +/*------------------------------------------------------------*/ +/*--- Function specific configuration options ---*/ +/*------------------------------------------------------------*/ + +/* Special value for separate_callers: automatic = adaptive */ +#define CONFIG_AUTO -1 + +#define CONFIG_DEFAULT -1 +#define CONFIG_FALSE 0 +#define CONFIG_TRUE 1 + +/* Logging configuration for a function */ +struct _fn_config { + Int dump_before; + Int dump_after; + Int zero_before; + Int toggle_collect; + + Int skip; /* Handle CALL to this function as JMP (= Skip)? */ + Int group; /* don't change caller dependency inside group !=0 */ + Int pop_on_jump; + + Int separate_callers; /* separate logging dependent on caller */ + Int separate_recursions; /* separate logging of rec. levels */ + +#if TG_ENABLE_DEBUG + Int verbosity; /* Change debug verbosity level while in function */ +#endif +}; + +/* Configurations for function name prefix patterns. + * Currently, only very limit patterns are possible: + * Exact prefix patterns and "*::" are allowed. + * E.g. + * - "abc" matches all functions starting with "abc". + * - "abc*::def" matches all functions starting with "abc" and + * starting with "def" after the first "::" separator. + * - "*::print(" matches C++ methods "print" in all classes + * without namespace. I.e. "*" doesn't match a "::". + * + * We build a trie from patterns, and for a given function, we + * go down the tree and apply all non-default configurations. 
+ */ + + +#define NODE_DEGREE 30 + +/* node of compressed trie search structure */ +typedef struct _config_node config_node; +struct _config_node { + Int length; + + fn_config* config; + config_node* sub_node[NODE_DEGREE]; + config_node* next; + config_node* wild_star; + config_node* wild_char; + + HChar name[1]; +}; + +/* root of trie */ +static config_node* fn_configs = 0; + +static __inline__ +fn_config* new_fnc(void) +{ + fn_config* fnc = (fn_config*) TG_MALLOC("cl.clo.nf.1", + sizeof(fn_config)); + + fnc->dump_before = CONFIG_DEFAULT; + fnc->dump_after = CONFIG_DEFAULT; + fnc->zero_before = CONFIG_DEFAULT; + fnc->toggle_collect = CONFIG_DEFAULT; + fnc->skip = CONFIG_DEFAULT; + fnc->pop_on_jump = CONFIG_DEFAULT; + fnc->group = CONFIG_DEFAULT; + fnc->separate_callers = CONFIG_DEFAULT; + fnc->separate_recursions = CONFIG_DEFAULT; + +#if TG_ENABLE_DEBUG + fnc->verbosity = CONFIG_DEFAULT; +#endif + + return fnc; +} + + +static config_node* new_config(const HChar* name, int length) +{ + int i; + config_node* node = (config_node*) TG_MALLOC("cl.clo.nc.1", + sizeof(config_node) + length); + + for(i=0;iname[i] = name[i]; + } + node->name[i] = 0; + + node->length = length; + node->config = 0; + for(i=0;isub_node[i] = 0; + node->next = 0; + node->wild_char = 0; + node->wild_star = 0; + + TG_DEBUG(3, " new_config('%s', len %d)\n", node->name, length); + + return node; +} + +static __inline__ +Bool is_wild(HChar n) +{ + return (n == '*') || (n == '?'); +} + +/* Recursively build up function matching tree (prefix tree). + * Returns function config object for pattern + * and starting at tree node <*pnode>. + * + * Tree nodes (config_node) are created as needed, + * tree root is stored into <*pnode>, and the created + * leaf (fn_config) for the given pattern is returned. 
+ */ +static fn_config* get_fnc2(config_node* node, const HChar* name) +{ + config_node *new_sub, *n, *nprev; + int offset, len; + + TG_DEBUG(3, " get_fnc2(%p, '%s')\n", node, name); + + if (name[0] == 0) { + if (!node->config) node->config = new_fnc(); + return node->config; + } + + if (is_wild(*name)) { + if (*name == '*') { + while(name[1] == '*') name++; + new_sub = node->wild_star; + } + else + new_sub = node->wild_char; + + if (!new_sub) { + new_sub = new_config(name, 1); + if (*name == '*') + node->wild_star = new_sub; + else + node->wild_char = new_sub; + } + + return get_fnc2( new_sub, name+1); + } + + n = node->sub_node[ name[0]%NODE_DEGREE ]; + nprev = 0; + len = 0; + while(n) { + for(len=0; name[len] == n->name[len]; len++); + if (len>0) break; + nprev = n; + n = n->next; + } + + if (!n) { + len = 1; + while(name[len] && (!is_wild(name[len]))) len++; + new_sub = new_config(name, len); + new_sub->next = node->sub_node[ name[0]%NODE_DEGREE ]; + node->sub_node[ name[0]%NODE_DEGREE ] = new_sub; + + if (name[len] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2( new_sub, name+len); + } + + if (len < n->length) { + + /* split up the subnode */ + config_node *new_node; + int i; + + new_node = new_config(n->name, len); + if (nprev) + nprev->next = new_node; + else + node->sub_node[ n->name[0]%NODE_DEGREE ] = new_node; + new_node->next = n->next; + + new_node->sub_node[ n->name[len]%NODE_DEGREE ] = n; + + for(i=0, offset=len; offset < n->length; i++, offset++) + n->name[i] = n->name[offset]; + n->name[i] = 0; + n->length = i; + + name += len; + offset = 0; + while(name[offset] && (!is_wild(name[offset]))) offset++; + new_sub = new_config(name, offset); + /* this sub_node of new_node could already be set: chain! 
*/
+      new_sub->next = new_node->sub_node[ name[0]%NODE_DEGREE ];
+      new_node->sub_node[ name[0]%NODE_DEGREE ] = new_sub;
+
+      if (name[offset]==0) {
+         new_sub->config = new_fnc();
+         return new_sub->config;
+      }
+
+      /* recurse on wildcard */
+      return get_fnc2( new_sub, name+offset);
+  }
+
+  name += n->length;
+
+  if (name[0] == 0) {
+    /* name and node name are the same */
+    if (!n->config) n->config = new_fnc();
+    return n->config;
+  }
+
+  offset = 1;
+  while(name[offset] && (!is_wild(name[offset]))) offset++;
+
+  new_sub = new_config(name, offset);
+  new_sub->next = n->sub_node[ name[0]%NODE_DEGREE ];
+  n->sub_node[ name[0]%NODE_DEGREE ] = new_sub;
+
+  return get_fnc2(new_sub, name+offset);
+}
+
+static void print_config_node(int depth, int hash, config_node* node)
+{
+  config_node* n;
+  int i;
+
+  if (node != fn_configs) {
+    const HChar sp[] = "                                        ";
+
+    if (depth>40) depth=40;
+    VG_(printf)("%s", sp+40-depth);
+    if (hash >=0) VG_(printf)(" [hash %2d]", hash);
+    else if (hash == -2) VG_(printf)(" [wildc ?]");
+    else if (hash == -3) VG_(printf)(" [wildc *]");
+    VG_(printf)(" '%s' (len %d)\n", node->name, node->length);
+  }
+  for(i=0;i<NODE_DEGREE;i++) {
+    n = node->sub_node[i];
+    while(n) {
+      print_config_node(depth+1, i, n);
+      n = n->next;
+    }
+  }
+  if (node->wild_char) print_config_node(depth+1, -2, node->wild_char);
+  if (node->wild_star) print_config_node(depth+1, -3, node->wild_star);
+}
+
+/* get a function config for a name pattern (from command line) */
+static fn_config* get_fnc(const HChar* name)
+{
+  fn_config* fnc;
+
+  TG_DEBUG(3, " +get_fnc(%s)\n", name);
+  if (fn_configs == 0)
+      fn_configs = new_config(name, 0);
+  fnc = get_fnc2(fn_configs, name);
+
+  TG_DEBUGIF(3) {
+    TG_DEBUG(3, " -get_fnc(%s):\n", name);
+    print_config_node(3, -1, fn_configs);
+  }
+  return fnc;
+}
+
+
+
+static void update_fn_config1(fn_node* fn, fn_config* fnc)
+{
+    if (fnc->dump_before != CONFIG_DEFAULT)
+	fn->dump_before = (fnc->dump_before == CONFIG_TRUE);
+
+    if (fnc->dump_after != CONFIG_DEFAULT)
+	fn->dump_after = (fnc->dump_after == CONFIG_TRUE);
+
+    if (fnc->zero_before != CONFIG_DEFAULT)
+	fn->zero_before = (fnc->zero_before == CONFIG_TRUE);
+
+    if (fnc->toggle_collect != CONFIG_DEFAULT)
+	fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE);
+
+    if (fnc->skip != CONFIG_DEFAULT)
+	fn->skip = (fnc->skip == CONFIG_TRUE);
+
+    if (fnc->pop_on_jump != CONFIG_DEFAULT)
+	fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE);
+
+    if (fnc->group != CONFIG_DEFAULT)
+	fn->group = fnc->group;
+
+    if (fnc->separate_callers != CONFIG_DEFAULT)
+	fn->separate_callers = fnc->separate_callers;
+
+    if (fnc->separate_recursions != CONFIG_DEFAULT)
+	fn->separate_recursions = fnc->separate_recursions;
+
+#if TG_ENABLE_DEBUG
+    if (fnc->verbosity != CONFIG_DEFAULT)
+	fn->verbosity = fnc->verbosity;
+#endif
+}
+
+/* Recursively go down the function matching tree,
+ * looking for a match to <name>. For every matching leaf,
+ * <fn> is updated with the pattern config.
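+ *
+ * Matching sketch (commentary added in this fork, not from the original
+ * source): for a function named "foobar" and the registered patterns
+ * "foo*" and "f??bar", both leaves match and update_fn_config1() is
+ * applied once per matching leaf. Note that the wild_star branch is
+ * retried for every suffix of <name>, which is what the while(*name)
+ * loop below implements.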
+ */ +static void update_fn_config2(fn_node* fn, const HChar* name, + config_node* node) +{ + config_node* n; + + TG_DEBUG(3, " update_fn_config2('%s', node '%s'): \n", + name, node->name); + if ((*name == 0) && node->config) { + TG_DEBUG(3, " found!\n"); + update_fn_config1(fn, node->config); + return; + } + + n = node->sub_node[ name[0]%NODE_DEGREE ]; + while(n) { + if (VG_(strncmp)(name, n->name, n->length)==0) break; + n = n->next; + } + if (n) { + TG_DEBUG(3, " '%s' matching at hash %d\n", + n->name, name[0]%NODE_DEGREE); + update_fn_config2(fn, name+n->length, n); + } + + if (node->wild_char) { + TG_DEBUG(3, " skip '%c' for wildcard '?'\n", *name); + update_fn_config2(fn, name+1, node->wild_char); + } + + if (node->wild_star) { + TG_DEBUG(3, " wildcard '*'\n"); + while(*name) { + update_fn_config2(fn, name, node->wild_star); + name++; + } + update_fn_config2(fn, name, node->wild_star); + } +} + +/* Update function config according to configs of name prefixes */ +void TG_(update_fn_config)(fn_node* fn) +{ + TG_DEBUG(3, " update_fn_config('%s')\n", fn->name); + if (fn_configs) + update_fn_config2(fn, fn->name, fn_configs); +} + + +/*--------------------------------------------------------------------*/ +/*--- Command line processing ---*/ +/*--------------------------------------------------------------------*/ + +Bool TG_(process_cmd_line_option)(const HChar* arg) +{ + const HChar* tmp_str; + + if VG_BOOL_CLO(arg, "--skip-plt", TG_(clo).skip_plt) {} + + else if VG_BOOL_CLO(arg, "--collect-jumps", TG_(clo).collect_jumps) {} + /* compatibility alias, deprecated option */ + else if VG_BOOL_CLO(arg, "--trace-jump", TG_(clo).collect_jumps) {} + + else if VG_BOOL_CLO(arg, "--combine-dumps", TG_(clo).combine_dumps) {} + + else if VG_BOOL_CLO(arg, "--collect-atstart", TG_(clo).collect_atstart) {} + + else if VG_BOOL_CLO(arg, "--instr-atstart", TG_(clo).instrument_atstart) {} + + else if VG_BOOL_CLO(arg, "--separate-threads", TG_(clo).separate_threads) {} + + else if VG_BOOL_CLO(arg, "--compress-strings", TG_(clo).compress_strings) {} + else if VG_BOOL_CLO(arg, "--compress-mangled", TG_(clo).compress_mangled) {} + else if VG_BOOL_CLO(arg, "--compress-pos", TG_(clo).compress_pos) {} + + else if VG_STR_CLO(arg, "--fn-skip", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->skip = CONFIG_TRUE; + } + else if VG_STR_CLO(arg, "--obj-skip", tmp_str) { + HChar *obj_name = VG_(strdup)("cl.clo.pclo.1", tmp_str); + TG_(clo).objs_to_skip_count++; + TG_(clo).objs_to_skip = VG_(realloc)("cl.clo.pclo.2", + TG_(clo).objs_to_skip, + TG_(clo).objs_to_skip_count*sizeof(HChar*)); + TG_(clo).objs_to_skip[TG_(clo).objs_to_skip_count-1] = obj_name; + } + + else if VG_STR_CLO(arg, "--dump-before", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->dump_before = CONFIG_TRUE; + } + + else if VG_STR_CLO(arg, "--zero-before", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->zero_before = CONFIG_TRUE; + } + + else if VG_STR_CLO(arg, "--dump-after", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->dump_after = CONFIG_TRUE; + } + + else if VG_STR_CLO(arg, "--toggle-collect", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->toggle_collect = CONFIG_TRUE; + /* defaults to initial collection off */ + TG_(clo).collect_atstart = False; + } + + else if VG_INT_CLO(arg, "--separate-recs", TG_(clo).separate_recursions) {} + + /* change handling of a jump between functions to ret+call */ + else if VG_XACT_CLO(arg, "--pop-on-jump", TG_(clo).pop_on_jump, True) {} + else if VG_STR_CLO( arg, "--pop-on-jump", 
tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->pop_on_jump = CONFIG_TRUE; + } + +#if TG_ENABLE_DEBUG + else if VG_INT_CLO(arg, "--ct-verbose", TG_(clo).verbose) {} + else if VG_INT_CLO(arg, "--ct-vstart", TG_(clo).verbose_start) {} + + else if VG_STREQN(12, arg, "--ct-verbose") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg+12, &s); + if ((n <= 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->verbosity = n; + } +#endif + + else if VG_XACT_CLO(arg, "--separate-callers=auto", + TG_(clo).separate_callers, CONFIG_AUTO) {} + else if VG_INT_CLO( arg, "--separate-callers", + TG_(clo).separate_callers) {} + + else if VG_STREQN(10, arg, "--fn-group") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg+10, &s); + if ((n <= 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->group = n; + } + + else if VG_STREQN(18, arg, "--separate-callers") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg+18, &s); + if ((n <= 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->separate_callers = n; + } + + else if VG_STREQN(15, arg, "--separate-recs") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg+15, &s); + if ((n <= 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->separate_recursions = n; + } + + else if VG_STR_CLO(arg, "--tracegrind-out-file", TG_(clo).out_format) {} + + else if VG_BOOL_CLO(arg, "--mangle-names", TG_(clo).mangle_names) {} + + else if VG_BOOL_CLO(arg, "--skip-direct-rec", + TG_(clo).skip_direct_recursion) {} + + else if VG_BOOL_CLO(arg, "--dump-bbs", TG_(clo).dump_bbs) {} + else if VG_BOOL_CLO(arg, "--dump-line", TG_(clo).dump_line) {} + else if VG_BOOL_CLO(arg, "--dump-instr", TG_(clo).dump_instr) {} + else if VG_BOOL_CLO(arg, "--dump-bb", TG_(clo).dump_bb) {} + + else if VG_INT_CLO( arg, "--dump-every-bb", TG_(clo).dump_every_bb) {} + + else if VG_BOOL_CLO(arg, "--collect-alloc", TG_(clo).collect_alloc) {} + else if VG_XACT_CLO(arg, "--collect-systime=no", + TG_(clo).collect_systime, systime_no) {} + else if VG_XACT_CLO(arg, "--collect-systime=msec", + TG_(clo).collect_systime, systime_msec) {} + else if VG_XACT_CLO(arg, "--collect-systime=yes", /* backward compatibility. */ + TG_(clo).collect_systime, systime_msec) {} + else if VG_XACT_CLO(arg, "--collect-systime=usec", + TG_(clo).collect_systime, systime_usec) {} + else if VG_XACT_CLO(arg, "--collect-systime=nsec", + TG_(clo).collect_systime, systime_nsec) { +# if defined(VGO_darwin) + VG_(fmsg_bad_option) + (arg, + "--collect-systime=nsec not supported on darwin\n"); +# endif + } + + else if VG_BOOL_CLO(arg, "--collect-bus", TG_(clo).collect_bus) {} + /* for option compatibility with cachegrind */ + else if VG_BOOL_CLO(arg, "--cache-sim", TG_(clo).simulate_cache) {} + /* compatibility alias, deprecated option */ + else if VG_BOOL_CLO(arg, "--simulate-cache", TG_(clo).simulate_cache) {} + /* for option compatibility with cachegrind */ + else if VG_BOOL_CLO(arg, "--branch-sim", TG_(clo).simulate_branch) {} + else { + Bool isCachesimOption = (*TG_(cachesim).parse_opt)(arg); + + /* cache simulator is used if a simulator option is given */ + if (isCachesimOption) + TG_(clo).simulate_cache = True; + + return isCachesimOption; + } + + return True; +} + +void TG_(print_usage)(void) +{ + VG_(printf)( +"\n dump creation options:\n" +" --tracegrind-out-file= Output file name [tracegrind.out.%%p]\n" +" --dump-line=no|yes Dump source lines of costs? [yes]\n" +" --dump-instr=no|yes Dump instruction address of costs? 
[no]\n"
+" --compress-strings=no|yes Compress strings in profile dump? [yes]\n"
+" --compress-pos=no|yes Compress positions in profile dump? [yes]\n"
+" --combine-dumps=no|yes Concat all dumps into same file [no]\n"
+#if TG_EXPERIMENTAL
+" --compress-events=no|yes Compress events in profile dump? [no]\n"
+" --dump-bb=no|yes Dump basic block address of costs? [no]\n"
+" --dump-bbs=no|yes Dump basic block info? [no]\n"
+" --dump-skipped=no|yes Dump info on skipped functions in calls? [no]\n"
+" --mangle-names=no|yes Mangle separation into names? [yes]\n"
+#endif
+
+"\n activity options (for interactivity use tracegrind_control):\n"
+" --dump-every-bb=<count> Dump every <count> basic blocks [0=never]\n"
+" --dump-before=<func> Dump when entering function\n"
+" --zero-before=<func> Zero all costs when entering function\n"
+" --dump-after=<func> Dump when leaving function\n"
+#if TG_EXPERIMENTAL
+" --dump-objs=no|yes Dump static object information [no]\n"
+#endif
+
+"\n data collection options:\n"
+" --instr-atstart=no|yes Do instrumentation at tracegrind start [yes]\n"
+" --collect-atstart=no|yes Collect at process/thread start [yes]\n"
+" --toggle-collect=<func> Toggle collection on enter/leave function\n"
+" --collect-jumps=no|yes Collect jumps? [no]\n"
+" --collect-bus=no|yes Collect global bus events? [no]\n"
+#if TG_EXPERIMENTAL
+" --collect-alloc=no|yes Collect memory allocation info? [no]\n"
+#endif
+" --collect-systime=no|yes|msec|usec|nsec Collect system call time info? [no]\n"
+" no Do not collect system call time info.\n"
+" msec|yes Collect syscount, syscall elapsed time (milli-seconds).\n"
+" usec Collect syscount, syscall elapsed time (micro-seconds).\n"
+" nsec Collect syscount, syscall elapsed and syscall cpu time (nano-seconds).\n"
+
+"\n cost entity separation options:\n"
+" --separate-threads=no|yes Separate data per thread [no]\n"
+" --separate-callers=<n> Separate functions by call chain length [0]\n"
+" --separate-callers<n>=<f> Separate <n> callers for function <f>\n"
+" --separate-recs=<n> Separate function recursions up to level [2]\n"
+" --separate-recs<n>=<f> Separate <n> recursions for function <f>\n"
+" --skip-plt=no|yes Ignore calls to/from PLT sections? [yes]\n"
+" --skip-direct-rec=no|yes Ignore direct recursions? [yes]\n"
+" --fn-skip=<function> Ignore calls to/from function?\n"
+" --obj-skip=<obj> Ignore calls to/from object?\n"
+#if TG_EXPERIMENTAL
+" --fn-group<no>=<func> Put function into separation group <no>\n"
+#endif
+"\n simulation options:\n"
+" --branch-sim=no|yes Do branch prediction simulation [no]\n"
+" --cache-sim=no|yes Do cache simulation [no]\n"
+    );
+
+   (*TG_(cachesim).print_opts)();
+
+//   VG_(printf)("\n"
+//	       " For full tracegrind documentation, see\n"
+//	       " "VG_PREFIX"/share/doc/tracegrind/html/tracegrind.html\n\n");
+}
+
+void TG_(print_debug_usage)(void)
+{
+   VG_(printf)(
+
+#if TG_ENABLE_DEBUG
+" --ct-verbose=<level> Verbosity of standard debug output [0]\n"
+" --ct-vstart=<BB number> Only be verbose after basic block [0]\n"
+" --ct-verbose<level>=<func> Verbosity while in <func>\n"
+#else
+" (none)\n"
+#endif
+
+   );
+}
+
+
+void TG_(set_clo_defaults)(void)
+{
+  /* Default values for command line arguments */
+
+  /* dump options */
+  TG_(clo).out_format = 0;
+  TG_(clo).combine_dumps = False;
+  TG_(clo).compress_strings = True;
+  TG_(clo).compress_mangled = False;
+  TG_(clo).compress_events = False;
+  TG_(clo).compress_pos = True;
+  TG_(clo).mangle_names = True;
+  TG_(clo).dump_line = True;
+  TG_(clo).dump_instr = False;
+  TG_(clo).dump_bb = False;
+  TG_(clo).dump_bbs = False;
+
+  TG_(clo).dump_every_bb = 0;
+
+  /* Collection */
+  TG_(clo).separate_threads = False;
+  TG_(clo).collect_atstart = True;
+  TG_(clo).collect_jumps = False;
+  TG_(clo).collect_alloc = False;
+  TG_(clo).collect_systime = systime_no;
+  TG_(clo).collect_bus = False;
+
+  TG_(clo).skip_plt = True;
+  TG_(clo).separate_callers = 0;
+  TG_(clo).separate_recursions = 2;
+  TG_(clo).skip_direct_recursion = False;
+
+  /* Instrumentation */
+  TG_(clo).instrument_atstart = True;
+  TG_(clo).simulate_cache = False;
+  TG_(clo).simulate_branch = False;
+
+  /* Call graph */
+  TG_(clo).pop_on_jump = False;
+  TG_(clo).objs_to_skip_count = 0;
+  TG_(clo).objs_to_skip = 0;
+
+#if TG_ENABLE_DEBUG
+  TG_(clo).verbose = 0;
+  TG_(clo).verbose_start = 0;
+#endif
+}
diff --git a/tracegrind/context.c b/tracegrind/context.c
new file mode 100644
index 000000000..e80234891
--- /dev/null
+++ b/tracegrind/context.c
@@ -0,0 +1,330 @@
+/*--------------------------------------------------------------------*/
+/*--- Tracegrind ---*/
+/*--- ct_context.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Tracegrind, a Valgrind tool for call tracing.
+
+   Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+   The GNU General Public License is contained in the file COPYING.
+*/ + +#include "global.h" + + +/*------------------------------------------------------------*/ +/*--- Context operations ---*/ +/*------------------------------------------------------------*/ + +#define N_FNSTACK_INITIAL_ENTRIES 500 +#define N_CXT_INITIAL_ENTRIES 2537 + +fn_stack TG_(current_fn_stack); + +void TG_(init_fn_stack)(fn_stack* s) +{ + TG_ASSERT(s != 0); + + s->size = N_FNSTACK_INITIAL_ENTRIES; + s->bottom = (fn_node**) TG_MALLOC("cl.context.ifs.1", + s->size * sizeof(fn_node*)); + s->top = s->bottom; + s->bottom[0] = 0; +} + +void TG_(copy_current_fn_stack)(fn_stack* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = TG_(current_fn_stack).size; + dst->bottom = TG_(current_fn_stack).bottom; + dst->top = TG_(current_fn_stack).top; +} + +void TG_(set_current_fn_stack)(fn_stack* s) +{ + TG_ASSERT(s != 0); + + TG_(current_fn_stack).size = s->size; + TG_(current_fn_stack).bottom = s->bottom; + TG_(current_fn_stack).top = s->top; +} + +static cxt_hash cxts; + +void TG_(init_cxt_table)(void) +{ + Int i; + + cxts.size = N_CXT_INITIAL_ENTRIES; + cxts.entries = 0; + cxts.table = (Context**) TG_MALLOC("cl.context.ict.1", + cxts.size * sizeof(Context*)); + + for (i = 0; i < cxts.size; i++) + cxts.table[i] = 0; +} + +/* double size of cxt table */ +static void resize_cxt_table(void) +{ + UInt i, new_size, conflicts1 = 0, conflicts2 = 0; + Context **new_table, *curr, *next; + UInt new_idx; + + new_size = 2* cxts.size +3; + new_table = (Context**) TG_MALLOC("cl.context.rct.1", + new_size * sizeof(Context*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < cxts.size; i++) { + if (cxts.table[i] == NULL) continue; + + curr = cxts.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = (UInt) (curr->hash % new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(cxts.table); + + + TG_DEBUG(0, "Resize Context Hash: %u => %u (entries %u, conflicts %u/%u)\n", + cxts.size, new_size, + cxts.entries, conflicts1, conflicts2); + + cxts.size = new_size; + cxts.table = new_table; + TG_(stat).cxt_hash_resizes++; +} + +__inline__ +static UWord cxt_hash_val(fn_node** fn, UInt size) +{ + UWord hash = 0; + UInt count = size; + while(*fn != 0) { + hash = (hash<<7) + (hash>>25) + (UWord)(*fn); + fn--; + count--; + if (count==0) break; + } + return hash; +} + +__inline__ +static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) +{ + int count; + fn_node** cxt_fn; + + if (hash != cxt->hash) return False; + + count = cxt->size; + cxt_fn = &(cxt->fn[0]); + while((*fn != 0) && (count>0)) { + if (*cxt_fn != *fn) return False; + fn--; + cxt_fn++; + count--; + } + return True; +} + +/** + * Allocate new Context structure + */ +static Context* new_cxt(fn_node** fn) +{ + Context* cxt; + UInt idx, offset; + UWord hash; + int size, recs; + fn_node* top_fn; + + TG_ASSERT(fn); + top_fn = *fn; + if (top_fn == 0) return 0; + + size = top_fn->separate_callers +1; + recs = top_fn->separate_recursions; + if (recs<1) recs=1; + + /* check fill degree of context hash table and resize if needed (>80%) */ + cxts.entries++; + if (10 * cxts.entries / cxts.size > 8) + resize_cxt_table(); + + cxt = (Context*) TG_MALLOC("cl.context.nc.1", + sizeof(Context)+sizeof(fn_node*)*size); + + // hash value calculation similar to cxt_hash_val(), but additionally + // copying function pointers in one run + hash = 0; + offset = 0; + while(*fn != 0) { + hash = (hash<<7) + 
(hash>>25) + (UWord)(*fn);
+      cxt->fn[offset] = *fn;
+      offset++;
+      fn--;
+      if (offset >= size) break;
+  }
+  if (offset < size) size = offset;
+
+  cxt->size = size;
+  cxt->base_number = TG_(stat).context_counter;
+  cxt->hash = hash;
+
+  TG_(stat).context_counter += recs;
+  TG_(stat).distinct_contexts++;
+
+  /* insert into Context hash table */
+  idx = (UInt) (hash % cxts.size);
+  cxt->next = cxts.table[idx];
+  cxts.table[idx] = cxt;
+
+#if TG_ENABLE_DEBUG
+  TG_DEBUGIF(3) {
+    VG_(printf)(" new_cxt ox%p: ", cxt);
+    TG_(print_cxt)(12, cxt, 0);
+  }
+#endif
+
+  return cxt;
+}
+
+/* get the Context structure for current context */
+Context* TG_(get_cxt)(fn_node** fn)
+{
+  Context* cxt;
+  UInt size, idx;
+  UWord hash;
+
+  TG_ASSERT(fn != 0);
+  if (*fn == 0) return 0;
+  size = (*fn)->separate_callers+1;
+  if (size<=0) { size = -size+1; }
+
+  TG_DEBUG(5, "+ get_cxt(fn '%s'): size %u\n",
+           (*fn)->name, size);
+
+  hash = cxt_hash_val(fn, size);
+
+  if ( ((cxt = (*fn)->last_cxt) != 0) && is_cxt(hash, fn, cxt)) {
+    TG_DEBUG(5, "- get_cxt: %p\n", cxt);
+    return cxt;
+  }
+
+  TG_(stat).cxt_lru_misses++;
+
+  idx = (UInt) (hash % cxts.size);
+  cxt = cxts.table[idx];
+
+  while(cxt) {
+    if (is_cxt(hash,fn,cxt)) break;
+    cxt = cxt->next;
+  }
+
+  if (!cxt)
+    cxt = new_cxt(fn);
+
+  (*fn)->last_cxt = cxt;
+
+  TG_DEBUG(5, "- get_cxt: %p\n", cxt);
+
+  return cxt;
+}
+
+
+/**
+ * Change execution context by calling a new function from current context
+ * Pushing 0x0 specifies a marker for a signal handler entry
+ */
+void TG_(push_cxt)(fn_node* fn)
+{
+  call_stack* cs = &TG_(current_call_stack);
+  Int fn_entries;
+
+  TG_DEBUG(5, "+ push_cxt(fn '%s'): old ctx %d\n",
+           fn ? fn->name : "0x0",
+           TG_(current_state).cxt ?
+           (Int)TG_(current_state).cxt->base_number : -1);
+
+  /* save old context on stack (even if not changed at all!) */
+  TG_ASSERT(cs->sp < cs->size);
+  TG_ASSERT(cs->entry[cs->sp].cxt == 0);
+  cs->entry[cs->sp].cxt = TG_(current_state).cxt;
+  cs->entry[cs->sp].fn_sp = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom;
+
+  if (fn && (*(TG_(current_fn_stack).top) == fn)) return;
+  if (fn && (fn->group>0) &&
+      ((*(TG_(current_fn_stack).top))->group == fn->group)) return;
+
+  /* resizing needed ? */
+  fn_entries = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom;
+  if (fn_entries == TG_(current_fn_stack).size-1) {
+      UInt new_size = TG_(current_fn_stack).size *2;
+      fn_node** new_array = (fn_node**) TG_MALLOC("cl.context.pc.1",
+                                                  new_size * sizeof(fn_node*));
+      int i;
+      for(i=0;i<TG_(current_fn_stack).size;i++)
+         new_array[i] = TG_(current_fn_stack).bottom[i];
+      VG_(free)(TG_(current_fn_stack).bottom);
+      TG_(current_fn_stack).top = new_array + fn_entries;
+      TG_(current_fn_stack).bottom = new_array;
+
+      TG_DEBUG(0, "Resize fn_stack: %u => %u (pushing '%s')\n",
+               TG_(current_fn_stack).size, new_size,
+               fn ? fn->name : "0x0");
+
+      TG_(current_fn_stack).size = new_size;
+  }
+
+  if (fn && (*(TG_(current_fn_stack).top) == 0)) {
+      UInt *pactive;
+
+      /* this is first function: increment its active count */
+      pactive = TG_(get_fn_entry)(fn->number);
+      (*pactive)++;
+  }
+
+  TG_(current_fn_stack).top++;
+  *(TG_(current_fn_stack).top) = fn;
+  TG_(current_state).cxt = TG_(get_cxt)(TG_(current_fn_stack).top);
+
+  TG_DEBUG(5, "- push_cxt(fn '%s'): new cxt %d, fn_sp %ld\n",
+           fn ? fn->name : "0x0",
+           TG_(current_state).cxt ?
+ (Int)TG_(current_state).cxt->base_number : -1, + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom + 0L); +} + diff --git a/tracegrind/costs.c b/tracegrind/costs.c new file mode 100644 index 000000000..765081b0a --- /dev/null +++ b/tracegrind/costs.c @@ -0,0 +1,68 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_costs.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include "pub_tool_mallocfree.h" + +#define COSTCHUNK_SIZE 100000 + +UInt TG_(costarray_entries) = 0; +UInt TG_(costarray_chunks) = 0; +static CostChunk* cost_chunk_base = 0; +static CostChunk* cost_chunk_current = 0; + +ULong* TG_(get_costarray)(Int size) +{ + ULong* ptr; + + if (!cost_chunk_current || + (cost_chunk_current->size - cost_chunk_current->used < size)) { + CostChunk* cc = (CostChunk*) TG_MALLOC("cl.costs.gc.1", + sizeof(CostChunk) + + COSTCHUNK_SIZE * sizeof(ULong)); + TG_ASSERT(size < COSTCHUNK_SIZE); + + cc->size = COSTCHUNK_SIZE; + cc->used = 0; + cc->next = 0; + + if (cost_chunk_current) + cost_chunk_current->next = cc; + cost_chunk_current = cc; + + if (!cost_chunk_base) cost_chunk_base = cc; + + TG_(costarray_chunks)++; + } + + ptr = &(cost_chunk_current->data[cost_chunk_current->used]); + cost_chunk_current->used += size; + + TG_(costarray_entries) += size; + + return ptr; +} diff --git a/tracegrind/costs.h b/tracegrind/costs.h new file mode 100644 index 000000000..eedf60c83 --- /dev/null +++ b/tracegrind/costs.h @@ -0,0 +1,55 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind cost array interface. costs.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2004-2017 Josef Weidendorfer + josef.weidendorfer@gmx.de + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + + +#ifndef TG_COSTS +#define TG_COSTS + +#include "pub_tool_basics.h" + +#define TG_(str) VGAPPEND(vgTracegrind_,str) + +extern UInt TG_(costarray_entries); +extern UInt TG_(costarray_chunks); + +/* Array of 64bit costs. This is separated from other structs + * to support a dynamic number of costs for a cost item. + * Chunks are allocated on demand. + */ +typedef struct _CostChunk CostChunk; +struct _CostChunk { + Int size; + Int used; + CostChunk *next, *prev; + ULong data[0]; +}; + +/* Allocate a number of 64bit cost values. + * Typically used from ct_events.c */ +ULong* TG_(get_costarray)(Int size); + +#endif /* TG_COSTS */ diff --git a/tracegrind/debug.c b/tracegrind/debug.c new file mode 100644 index 000000000..940a9e803 --- /dev/null +++ b/tracegrind/debug.c @@ -0,0 +1,460 @@ +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains lot of code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" +#include "events.h" + +/* If debugging mode of, dummy functions are provided (see below) + */ +#if TG_ENABLE_DEBUG + +/*------------------------------------------------------------*/ +/*--- Debug output helpers ---*/ +/*------------------------------------------------------------*/ + +static void print_indent(int s) +{ + /* max of 40 spaces */ + const HChar sp[] = " "; + if (s>40) s=40; + VG_(printf)("%s", sp+40-s); +} + +void TG_(print_bb)(int s, BB* bb) +{ + if (s<0) { + s = -s; + print_indent(s); + } + + VG_(printf)("BB %#lx (Obj '%s')", bb_addr(bb), bb->obj->name); +} + +static +void print_mangled_cxt(Context* cxt, int rec_index) +{ + int i; + + if (!cxt) + VG_(printf)("(none)"); + else { + VG_(printf)("%s", cxt->fn[0]->name); + if (rec_index >0) + VG_(printf)("'%d", rec_index +1); + for(i=1;isize;i++) + VG_(printf)("'%s", cxt->fn[i]->name); + } +} + + + +void TG_(print_cxt)(Int s, Context* cxt, int rec_index) +{ + if (s<0) { + s = -s; + print_indent(s); + } + + if (cxt) { + UInt *pactive = TG_(get_fn_entry)(cxt->fn[0]->number); + TG_ASSERT(rec_index < cxt->fn[0]->separate_recursions); + + VG_(printf)("Cxt %u" ,cxt->base_number + rec_index); + if (*pactive>0) + VG_(printf)(" [active=%u]", *pactive); + VG_(printf)(": "); + print_mangled_cxt(cxt, rec_index); + VG_(printf)("\n"); + } + else + VG_(printf)("(no context)\n"); +} + +void TG_(print_execstate)(int s, exec_state* es) +{ + if (s<0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("ExecState 0x0\n"); + return; + } + + VG_(printf)("ExecState [Sig %d, collect %s, nonskipped %p]: jmps_passed %d\n", + es->sig, es->collect?"yes":"no", + es->nonskipped, es->jmps_passed); +} + + +void TG_(print_bbcc)(int s, BBCC* bbcc) +{ + BB* bb; + + if (s<0) { + s = 
-s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb!=0); + + VG_(printf)("%s +%#lx=%#lx, ", + bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + TG_(print_cxt)(s+8, bbcc->cxt, bbcc->rec_index); +} + +void TG_(print_eventset)(int s, EventSet* es) +{ + int i, j; + UInt mask; + EventGroup* eg; + + if (s<0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("(EventSet not set)\n"); + return; + } + + VG_(printf)("EventSet %u (%d groups, size %d):", + es->mask, es->count, es->size); + + if (es->count == 0) { + VG_(printf)("-\n"); + return; + } + + for(i=0, mask=1; imask & mask)==0) continue; + eg = TG_(get_event_group)(i); + if (!eg) continue; + VG_(printf)(" (%d: %s", i, eg->name[0]); + for(j=1; jsize; j++) + VG_(printf)(" %s", eg->name[j]); + VG_(printf)(")"); + } + VG_(printf)("\n"); +} + + +void TG_(print_cost)(int s, EventSet* es, ULong* c) +{ + Int i, j, pos, off; + UInt mask; + EventGroup* eg; + + if (s<0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("Cost (Nothing, EventSet not set)\n"); + return; + } + if (!c) { + VG_(printf)("Cost (Null, EventSet %u)\n", es->mask); + return; + } + + if (es->size == 0) { + VG_(printf)("Cost (Nothing, EventSet with len 0)\n"); + return; + } + + pos = s; + pos += VG_(printf)("Cost [%p]: ", c); + off = 0; + for(i=0, mask=1; imask & mask)==0) continue; + eg = TG_(get_event_group)(i); + if (!eg) continue; + for(j=0; jsize; j++) { + + if (off>0) { + if (pos > 70) { + VG_(printf)(",\n"); + print_indent(s+5); + pos = s+5; + } + else + pos += VG_(printf)(", "); + } + + pos += VG_(printf)("%s %llu", eg->name[j], c[off++]); + } + } + VG_(printf)("\n"); +} + + +void TG_(print_short_jcc)(jCC* jcc) +{ + if (jcc) + VG_(printf)("%#lx => %#lx [calls %llu/Ir %llu, Dr %llu, Dw %llu]", + bb_jmpaddr(jcc->from->bb), + bb_addr(jcc->to->bb), + jcc->call_counter, + jcc->cost ? jcc->cost[fullOffset(EG_IR)]:0, + jcc->cost ? jcc->cost[fullOffset(EG_DR)]:0, + jcc->cost ? jcc->cost[fullOffset(EG_DW)]:0); + else + VG_(printf)("[Skipped JCC]"); +} + +void TG_(print_jcc)(int s, jCC* jcc) +{ + if (s<0) { + s = -s; + print_indent(s); + } + + if (!jcc) { + VG_(printf)("JCC to skipped function\n"); + return; + } + VG_(printf)("JCC %p from ", jcc); + TG_(print_bbcc)(s+9, jcc->from); + print_indent(s+4); + VG_(printf)("to "); + TG_(print_bbcc)(s+9, jcc->to); + print_indent(s+4); + VG_(printf)("Calls %llu\n", jcc->call_counter); + print_indent(s+4); + TG_(print_cost)(s+9, TG_(sets).full, jcc->cost); +} + +/* dump out the current call stack */ +void TG_(print_stackentry)(int s, int sp) +{ + call_entry* ce; + + if (s<0) { + s = -s; + print_indent(s); + } + + ce = TG_(get_call_entry)(sp); + VG_(printf)("[%-2d] SP %#lx, RA %#lx", sp, ce->sp, ce->ret_addr); + if (ce->nonskipped) + VG_(printf)(" NonSkipped BB %#lx / %s", + bb_addr(ce->nonskipped->bb), + ce->nonskipped->cxt->fn[0]->name); + VG_(printf)("\n"); + print_indent(s+5); + TG_(print_jcc)(5,ce->jcc); +} + +/* debug output */ +#if 0 +static void print_call_stack() +{ + int c; + + VG_(printf)("Call Stack:\n"); + for(c=0;cbb), + (bbcc->bb->sect_kind == Vg_SectText) ? 'T' : + (bbcc->bb->sect_kind == Vg_SectData) ? 'D' : + (bbcc->bb->sect_kind == Vg_SectBSS) ? 'B' : + (bbcc->bb->sect_kind == Vg_SectGOT) ? 'G' : + (bbcc->bb->sect_kind == Vg_SectPLT) ? 
'P' : 'U', + bbcc->cxt->base_number+bbcc->rec_index); + print_mangled_cxt(bbcc->cxt, bbcc->rec_index); + + obj = bbcc->cxt->fn[0]->file->obj; + if (obj->name[0]) + VG_(printf)(" %s", obj->name+obj->last_slash_pos); + + if (VG_(strcmp)(bbcc->cxt->fn[0]->file->name, "???") !=0) { + VG_(printf)(" %s", bbcc->cxt->fn[0]->file->name); + if ((bbcc->cxt->fn[0] == bbcc->bb->fn) && (bbcc->bb->line>0)) + VG_(printf)(":%u", bbcc->bb->line); + } +} + +void TG_(print_bbcc_cost)(int s, BBCC* bbcc) +{ + BB* bb; + Int i, cjmpNo; + ULong ecounter; + + if (s<0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb!=0); + + TG_(print_bbcc)(s, bbcc); + + ecounter = bbcc->ecounter_sum; + + print_indent(s+2); + VG_(printf)("ECounter: sum %llu ", ecounter); + for(i=0; icjmp_count; i++) { + VG_(printf)("[%u]=%llu ", + bb->jmp[i].instr, bbcc->jmp[i].ecounter); + } + VG_(printf)("\n"); + + cjmpNo = 0; + for(i=0; iinstr_count; i++) { + InstrInfo* ii = &(bb->instr[i]); + print_indent(s+2); + VG_(printf)("[%2d] IOff %2u ecnt %3llu ", + i, ii->instr_offset, ecounter); + TG_(print_cost)(s+5, ii->eventset, bbcc->cost + ii->cost_offset); + + /* update execution counter */ + if (cjmpNo < bb->cjmp_count) + if (bb->jmp[cjmpNo].instr == i) { + ecounter -= bbcc->jmp[cjmpNo].ecounter; + cjmpNo++; + } + } +} + + +/* dump out an address with source info if available */ +void TG_(print_addr)(Addr addr) +{ + const HChar *fn_buf, *fl_buf, *dir_buf; + const HChar* obj_name; + DebugInfo* di; + UInt ln, i=0, opos=0; + + if (addr == 0) { + VG_(printf)("%08lx", addr); + return; + } + + TG_(get_debug_info)(addr, &dir_buf, &fl_buf, &fn_buf, &ln, &di); + + if (VG_(strcmp)(fn_buf,"???")==0) + VG_(printf)("%#lx", addr); + else + VG_(printf)("%#lx %s", addr, fn_buf); + + if (di) { + obj_name = VG_(DebugInfo_get_filename)(di); + if (obj_name) { + while(obj_name[i]) { + if (obj_name[i]=='/') opos = i+1; + i++; + } + if (obj_name[0]) + VG_(printf)(" %s", obj_name+opos); + } + } + + if (ln>0) { + if (dir_buf[0]) + VG_(printf)(" (%s/%s:%u)", dir_buf, fl_buf, ln); + else + VG_(printf)(" (%s:%u)", fl_buf, ln); + } +} + +void TG_(print_addr_ln)(Addr addr) +{ + TG_(print_addr)(addr); + VG_(printf)("\n"); +} + +static ULong bb_written = 0; + +void TG_(print_bbno)(void) +{ + if (bb_written != TG_(stat).bb_executions) { + bb_written = TG_(stat).bb_executions; + VG_(printf)("BB# %llu\n",TG_(stat).bb_executions); + } +} + +void TG_(print_context)(void) +{ + BBCC* bbcc; + + TG_DEBUG(0,"In tid %u [%d] ", + TG_(current_tid), TG_(current_call_stack).sp); + bbcc = TG_(current_state).bbcc; + print_mangled_cxt(TG_(current_state).cxt, + bbcc ? 
bbcc->rec_index : 0); + VG_(printf)("\n"); +} + +void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f) +{ + TG_DEBUG(3, "Malloc(%lu) in %s.\n", s, f); + return VG_(malloc)(cc,s); +} + +#else /* TG_ENABLE_DEBUG */ + +void TG_(print_bbno)(void) {} +void TG_(print_context)(void) {} +void TG_(print_jcc)(int s, jCC* jcc) {} +void TG_(print_bbcc)(int s, BBCC* bbcc) {} +void TG_(print_bbcc_fn)(BBCC* bbcc) {} +void TG_(print_cost)(int s, EventSet* es, ULong* cost) {} +void TG_(print_bb)(int s, BB* bb) {} +void TG_(print_cxt)(int s, Context* cxt, int rec_index) {} +void TG_(print_short_jcc)(jCC* jcc) {} +void TG_(print_stackentry)(int s, int sp) {} +void TG_(print_addr)(Addr addr) {} +void TG_(print_addr_ln)(Addr addr) {} + +#endif diff --git a/tracegrind/dump.c b/tracegrind/dump.c new file mode 100644 index 000000000..cd602e7fd --- /dev/null +++ b/tracegrind/dump.c @@ -0,0 +1,1763 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- dump.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" +#include "global.h" + +#include "pub_tool_threadstate.h" +#include "pub_tool_libcfile.h" + + +/* Dump Part Counter */ +static Int out_counter = 0; + +static HChar* out_file = 0; +static Bool dumps_initialized = False; + +/* Command */ +static HChar *cmdbuf; + +/* Total reads/writes/misses sum over all dumps and threads. + * Updated during CC traversal at dump time. 
+ */ +FullCost TG_(total_cost) = 0; +static FullCost dump_total_cost = 0; + +EventMapping* TG_(dumpmap) = 0; + +Int TG_(get_dump_counter)(void) +{ + return out_counter; +} + +/*------------------------------------------------------------*/ +/*--- Output file related stuff ---*/ +/*------------------------------------------------------------*/ + +/* Boolean dumping array */ +static Bool* dump_array = 0; +static Int dump_array_size = 0; +static Bool* obj_dumped = 0; +static Bool* file_dumped = 0; +static Bool* fn_dumped = 0; +static Bool* cxt_dumped = 0; + +static +void reset_dump_array(void) +{ + int i; + + TG_ASSERT(dump_array != 0); + + for(i=0;ifile = 0; + p->fn = 0; + p->obj = 0; + p->cxt = 0; + p->rec_index = 0; +} + + +static void print_obj(VgFile *fp, const HChar* prefix, obj_node* obj) +{ + if (TG_(clo).compress_strings) { + TG_ASSERT(obj_dumped != 0); + if (obj_dumped[obj->number]) + VG_(fprintf)(fp, "%s(%u)\n", prefix, obj->number); + else { + VG_(fprintf)(fp, "%s(%u) %s\n", prefix, obj->number, obj->name); + } + } + else + VG_(fprintf)(fp, "%s%s\n", prefix, obj->name); + +#if 0 + /* add mapping parameters the first time a object is dumped + * format: mp=0xSTART SIZE 0xOFFSET */ + if (!obj_dumped[obj->number]) { + obj_dumped[obj->number]; + VG_(fprintf)(fp, "mp=%p %p %p\n", + pos->obj->start, pos->obj->size, pos->obj->offset); + } +#else + obj_dumped[obj->number] = True; +#endif +} + +static void print_file(VgFile *fp, const char *prefix, const file_node* file) +{ + if (TG_(clo).compress_strings) { + TG_ASSERT(file_dumped != 0); + if (file_dumped[file->number]) + VG_(fprintf)(fp, "%s(%u)\n", prefix, file->number); + else { + VG_(fprintf)(fp, "%s(%u) %s\n", prefix, file->number, file->name); + file_dumped[file->number] = True; + } + } + else + VG_(fprintf)(fp, "%s%s\n", prefix, file->name); +} + +/* + * tag can be "fn", "cfn", "jfn" + */ +static void print_fn(VgFile *fp, const HChar* tag, const fn_node* fn) +{ + VG_(fprintf)(fp, "%s=",tag); + if (TG_(clo).compress_strings) { + TG_ASSERT(fn_dumped != 0); + if (fn_dumped[fn->number]) + VG_(fprintf)(fp, "(%u)\n", fn->number); + else { + VG_(fprintf)(fp, "(%u) %s\n", fn->number, fn->name); + fn_dumped[fn->number] = True; + } + } + else + VG_(fprintf)(fp, "%s\n", fn->name); +} + +static void print_mangled_fn(VgFile *fp, const HChar* tag, + Context* cxt, int rec_index) +{ + int i; + + if (TG_(clo).compress_strings && TG_(clo).compress_mangled) { + + int n; + Context* last; + + TG_ASSERT(cxt_dumped != 0); + if (cxt_dumped[cxt->base_number+rec_index]) { + VG_(fprintf)(fp, "%s=(%u)\n", + tag, cxt->base_number + rec_index); + return; + } + + last = 0; + /* make sure that for all context parts compressed data is written */ + for(i=cxt->size;i>0;i--) { + TG_ASSERT(cxt->fn[i-1]->pure_cxt != 0); + n = cxt->fn[i-1]->pure_cxt->base_number; + if (cxt_dumped[n]) continue; + VG_(fprintf)(fp, "%s=(%d) %s\n", + tag, n, cxt->fn[i-1]->name); + + cxt_dumped[n] = True; + last = cxt->fn[i-1]->pure_cxt; + } + /* If the last context was the context to print, we are finished */ + if ((last == cxt) && (rec_index == 0)) return; + + VG_(fprintf)(fp, "%s=(%u) (%u)", tag, + cxt->base_number + rec_index, + cxt->fn[0]->pure_cxt->base_number); + if (rec_index >0) + VG_(fprintf)(fp, "'%d", rec_index +1); + for(i=1;isize;i++) + VG_(fprintf)(fp, "'(%u)", + cxt->fn[i]->pure_cxt->base_number); + VG_(fprintf)(fp, "\n"); + + cxt_dumped[cxt->base_number+rec_index] = True; + return; + } + + + VG_(fprintf)(fp, "%s=", tag); + if (TG_(clo).compress_strings) { + 
TG_ASSERT(cxt_dumped != 0); + if (cxt_dumped[cxt->base_number+rec_index]) { + VG_(fprintf)(fp, "(%u)\n", cxt->base_number + rec_index); + return; + } + else { + VG_(fprintf)(fp, "(%u) ", cxt->base_number + rec_index); + cxt_dumped[cxt->base_number+rec_index] = True; + } + } + + VG_(fprintf)(fp, "%s", cxt->fn[0]->name); + if (rec_index >0) + VG_(fprintf)(fp, "'%d", rec_index +1); + for(i=1;isize;i++) + VG_(fprintf)(fp, "'%s", cxt->fn[i]->name); + + VG_(fprintf)(fp, "\n"); +} + + + +/** + * Print function position of the BBCC, but only print info differing to + * the position, update + * Return True if something changes. + */ +static Bool print_fn_pos(VgFile *fp, FnPos* last, BBCC* bbcc) +{ + Bool res = False; + + TG_ASSERT(bbcc && bbcc->cxt); + + TG_DEBUGIF(3) { + TG_DEBUG(2, "+ print_fn_pos: "); + TG_(print_cxt)(16, bbcc->cxt, bbcc->rec_index); + } + + if (!TG_(clo).mangle_names) { + if (last->rec_index != bbcc->rec_index) { + VG_(fprintf)(fp, "rec=%u\n\n", bbcc->rec_index); + last->rec_index = bbcc->rec_index; + last->cxt = 0; /* reprint context */ + res = True; + } + + if (last->cxt != bbcc->cxt) { + fn_node* last_from = (last->cxt && last->cxt->size >1) ? + last->cxt->fn[1] : 0; + fn_node* curr_from = (bbcc->cxt->size >1) ? + bbcc->cxt->fn[1] : 0; + if (curr_from == 0) { + if (last_from != 0) { + /* switch back to no context */ + VG_(fprintf)(fp, "frfn=(spontaneous)\n"); + res = True; + } + } + else if (last_from != curr_from) { + print_fn(fp, "frfn", curr_from); + res = True; + } + last->cxt = bbcc->cxt; + } + } + + if (last->obj != bbcc->cxt->fn[0]->file->obj) { + print_obj(fp, "ob=", bbcc->cxt->fn[0]->file->obj); + last->obj = bbcc->cxt->fn[0]->file->obj; + res = True; + } + + if (last->file != bbcc->cxt->fn[0]->file) { + print_file(fp, "fl=", bbcc->cxt->fn[0]->file); + last->file = bbcc->cxt->fn[0]->file; + res = True; + } + + if (!TG_(clo).mangle_names) { + if (last->fn != bbcc->cxt->fn[0]) { + print_fn(fp, "fn", bbcc->cxt->fn[0]); + last->fn = bbcc->cxt->fn[0]; + res = True; + } + } + else { + /* Print mangled name if context or rec_index changes */ + if ((last->rec_index != bbcc->rec_index) || + (last->cxt != bbcc->cxt)) { + + print_mangled_fn(fp, "fn", bbcc->cxt, bbcc->rec_index); + last->fn = bbcc->cxt->fn[0]; + last->rec_index = bbcc->rec_index; + res = True; + } + } + + last->cxt = bbcc->cxt; + + TG_DEBUG(2, "- print_fn_pos: %s\n", res ? "changed" : ""); + + return res; +} + +/* the debug lookup cache is useful if BBCC for same BB are + * dumped directly in a row. This is a direct mapped cache. 
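+ *
+ * Minimal sketch of the lookup (added commentary, not in the original
+ * code): each address maps to exactly one slot,
+ *
+ *    int slot = addr % DEBUG_CACHE_SIZE;
+ *    if (debug_cache_addr[slot] == addr) ... reuse the cached file/line
+ *
+ * so a different address hashing to the same slot simply overwrites the
+ * previous entry; there is no collision chain.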
+ */ +#define DEBUG_CACHE_SIZE 1777 + +static Addr debug_cache_addr[DEBUG_CACHE_SIZE]; +static file_node* debug_cache_file[DEBUG_CACHE_SIZE]; +static int debug_cache_line[DEBUG_CACHE_SIZE]; +static Bool debug_cache_info[DEBUG_CACHE_SIZE]; +static const HChar* debug_cache_inlfn[DEBUG_CACHE_SIZE]; + +static __inline__ +void init_debug_cache(void) +{ + int i; + for(i=0;iline = debug_cache_line[cachepos]; + p->file = debug_cache_file[cachepos]; + found_file_line = debug_cache_info[cachepos]; + } + else { + DiEpoch ep = VG_(current_DiEpoch)(); + found_file_line = VG_(get_filename_linenum)(ep, addr, + &file, + &dir, + &(p->line)); + if (!found_file_line) { + file = "???"; + p->line = 0; + } + p->file = TG_(get_file_node)(bbcc->bb->obj, dir, file); + + debug_cache_info[cachepos] = found_file_line; + debug_cache_addr[cachepos] = addr; + debug_cache_line[cachepos] = p->line; + debug_cache_file[cachepos] = p->file; + + /* Query inline info at the same time we query file/line */ + const HChar* inl_fn = 0; + Bool has_inline = VG_(get_inline_fnname)(ep, addr, &inl_fn); + if (has_inline) { + debug_cache_inlfn[cachepos] = inl_fn; + } else { + debug_cache_inlfn[cachepos] = (const HChar*)(-1); + } + } + + /* Address offset from bbcc start address */ + p->addr = addr - bbcc->bb->obj->offset; + p->bb_addr = bbcc->bb->offset; + + TG_DEBUG(3, " get_debug_pos(%#lx): BB %#lx, fn '%s', file '%s', line %u\n", + addr, bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name, + p->file->name, p->line); + + return found_file_line; +} + +/* Get inline function name for an address, with caching. + * Returns True if address is in an inlined function, False otherwise. + * If True, *inl_fn will be set to the inline function name. + */ +static Bool get_inline_info(Addr addr, const HChar** inl_fn) +{ + int cachepos = addr % DEBUG_CACHE_SIZE; + + /* Check cache first - but only if inline info was already queried for this address */ + if (debug_cache_addr[cachepos] == addr && debug_cache_inlfn[cachepos] != 0) { + /* We have cached inline info for this address */ + if (debug_cache_inlfn[cachepos] == (const HChar*)(-1)) { + /* Special marker: no inline function at this address */ + *inl_fn = 0; + return False; + } + *inl_fn = debug_cache_inlfn[cachepos]; + return True; + } + + DiEpoch ep = VG_(current_DiEpoch)(); + Bool has_inline = VG_(get_inline_fnname)(ep, addr, inl_fn); + + if (has_inline) { + /* Cache the inline function name */ + debug_cache_inlfn[cachepos] = *inl_fn; + } else { + *inl_fn = 0; + /* Use special marker -1 to indicate "no inline function" */ + debug_cache_inlfn[cachepos] = (const HChar*)(-1); + } + + TG_DEBUG(3, " get_inline_info(%#lx): %s\n", + addr, has_inline ? 
*inl_fn : "(not inlined)"); + + return has_inline; +} + + +/* copy file position and init cost */ +static void init_apos(AddrPos* p, Addr addr, Addr bbaddr, file_node* file) +{ + p->addr = addr; + p->bb_addr = bbaddr; + p->file = file; + p->line = 0; +} + +static void copy_apos(AddrPos* dst, AddrPos* src) +{ + dst->addr = src->addr; + dst->bb_addr = src->bb_addr; + dst->file = src->file; + dst->line = src->line; +} + +/* copy file position and init cost */ +static void init_fcost(AddrCost* c, Addr addr, Addr bbaddr, file_node* file) +{ + init_apos( &(c->p), addr, bbaddr, file); + /* FIXME: This is a memory leak as a AddrCost is inited multiple times */ + c->cost = TG_(get_eventset_cost)( TG_(sets).full ); + TG_(init_cost)( TG_(sets).full, c->cost ); +} + +/* Track last inline function to avoid repeated cfni= output */ +static const HChar* last_inline_fn = 0; + +/** + * print position change inside of a BB (last -> curr) + * this doesn't update last to curr! + */ +static void fprint_apos(VgFile *fp, AddrPos* curr, AddrPos* last, + file_node* func_file, BBCC* bbcc) +{ + TG_ASSERT(curr->file != 0); + TG_DEBUG(2, " print_apos(file '%s', line %u, bb %#lx, addr %#lx) fnFile '%s'\n", + curr->file->name, curr->line, curr->bb_addr, curr->addr, + func_file->name); + + if (curr->file != last->file) { + + /* if we switch back to orig file, use fe=... */ + if (curr->file == func_file) + print_file(fp, "fe=", curr->file); + else + print_file(fp, "fi=", curr->file); + } + + /* Check inline function for this position and output cfni= if changed */ + if (bbcc) { + Addr curr_addr = curr->addr + bbcc->bb->obj->offset; + const HChar* inline_fn = 0; + Bool is_inline = get_inline_info(curr_addr, &inline_fn); + + /* Output cfni= if inline function changed */ + if (is_inline && inline_fn && inline_fn != last_inline_fn) { + VG_(fprintf)(fp, "cfni=%s\n", inline_fn); + last_inline_fn = inline_fn; + } + /* Clear last_inline_fn if we're no longer in inline code */ + else if (!is_inline && last_inline_fn) { + VG_(fprintf)(fp, "cfni=???\n"); + last_inline_fn = 0; + } + } + + if (TG_(clo).dump_bbs) { + if (curr->line != last->line) { + VG_(fprintf)(fp, "ln=%u\n", curr->line); + } + } +} + + + +/** + * Print a position. + * This prints out differences if allowed + * + * This doesn't set last to curr afterwards! 
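+ *
+ * Example of the compressed form (added commentary, assuming the default
+ * --compress-pos=yes): for successive line numbers 10, 12, 12, 11 the
+ * dump contains "10", "+2", "*", "-1", i.e. an absolute value first,
+ * then differences, with "*" meaning "same position as before".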
+ */ +static +void fprint_pos(VgFile *fp, const AddrPos* curr, const AddrPos* last) +{ + if (0) //TG_(clo).dump_bbs) + VG_(fprintf)(fp, "%lu ", curr->addr - curr->bb_addr); + else { + if (TG_(clo).dump_instr) { + int diff = curr->addr - last->addr; + if ( TG_(clo).compress_pos && (last->addr >0) && + (diff > -100) && (diff < 100)) { + if (diff >0) + VG_(fprintf)(fp, "+%d ", diff); + else if (diff==0) + VG_(fprintf)(fp, "* "); + else + VG_(fprintf)(fp, "%d ", diff); + } + else + VG_(fprintf)(fp, "%#lx ", curr->addr); + } + + if (TG_(clo).dump_bb) { + int diff = curr->bb_addr - last->bb_addr; + if ( TG_(clo).compress_pos && (last->bb_addr >0) && + (diff > -100) && (diff < 100)) { + if (diff >0) + VG_(fprintf)(fp, "+%d ", diff); + else if (diff==0) + VG_(fprintf)(fp, "* "); + else + VG_(fprintf)(fp, "%d ", diff); + } + else + VG_(fprintf)(fp, "%#lx ", curr->bb_addr); + } + + if (TG_(clo).dump_line) { + int diff = curr->line - last->line; + if ( TG_(clo).compress_pos && (last->line >0) && + (diff > -100) && (diff < 100)) { + + if (diff >0) + VG_(fprintf)(fp, "+%d ", diff); + else if (diff==0) + VG_(fprintf)(fp, "* "); + else + VG_(fprintf)(fp, "%d ", diff); + } + else + VG_(fprintf)(fp, "%u ", curr->line); + } + } +} + + +/** + * Print events. + */ + +static +void fprint_cost(VgFile *fp, const EventMapping* es, const ULong* cost) +{ + HChar *mcost = TG_(mappingcost_as_string)(es, cost); + VG_(fprintf)(fp, "%s\n", mcost); + TG_FREE(mcost); +} + + + +/* Write the cost of a source line; only that parts of the source + * position are written that changed relative to last written position. + * funcPos is the source position of the first line of actual function. + * Something is written only if cost != 0; returns True in this case. + */ +static void fprint_fcost(VgFile *fp, AddrCost* c, AddrPos* last) +{ + TG_DEBUGIF(3) { + TG_DEBUG(2, " print_fcost(file '%s', line %u, bb %#lx, addr %#lx):\n", + c->p.file->name, c->p.line, c->p.bb_addr, c->p.addr); + TG_(print_cost)(-5, TG_(sets).full, c->cost); + } + + fprint_pos(fp, &(c->p), last); + copy_apos( last, &(c->p) ); /* update last to current position */ + + fprint_cost(fp, TG_(dumpmap), c->cost); + + /* add cost to total */ + TG_(add_and_zero_cost)( TG_(sets).full, dump_total_cost, c->cost ); +} + + +/* Write out the calls from jcc (at pos) + */ +static void fprint_jcc(VgFile *fp, jCC* jcc, AddrPos* curr, AddrPos* last, + ULong ecounter) +{ + static AddrPos target; + file_node* file; + obj_node* obj; + + TG_DEBUGIF(2) { + TG_DEBUG(2, " fprint_jcc (jkind %d)\n", (Int)jcc->jmpkind); + TG_(print_jcc)(-10, jcc); + } + + TG_ASSERT(jcc->to !=0); + TG_ASSERT(jcc->from !=0); + + if (!get_debug_pos(jcc->to, bb_addr(jcc->to->bb), &target)) { + /* if we don't have debug info, don't switch to file "???" */ + target.file = last->file; + } + + if ((jcc->jmpkind == jk_CondJump) || (jcc->jmpkind == jk_Jump)) { + + /* this is a JCC for a followed conditional or boring jump. */ + TG_ASSERT(TG_(is_zero_cost)( TG_(sets).full, jcc->cost)); + + /* objects among jumps should be the same. + * Otherwise this jump would have been changed to a call + * (see setup_bbcc) + */ + TG_ASSERT(jcc->from->bb->obj == jcc->to->bb->obj); + + /* only print if target position info is useful */ + if (!TG_(clo).dump_instr && !TG_(clo).dump_bb && target.line==0) { + jcc->call_counter = 0; + return; + } + + /* Different files/functions are possible e.g. 
with longjmp's + * which change the stack, and thus context + */ + if (last->file != target.file) { + print_file(fp, "jfi=", target.file); + } + + if (jcc->from->cxt != jcc->to->cxt) { + if (TG_(clo).mangle_names) + print_mangled_fn(fp, "jfn", + jcc->to->cxt, jcc->to->rec_index); + else + print_fn(fp, "jfn", jcc->to->cxt->fn[0]); + } + + if (jcc->jmpkind == jk_CondJump) { + /* format: jcnd=/ */ + VG_(fprintf)(fp, "jcnd=%llu/%llu ", + jcc->call_counter, ecounter); + } + else { + /* format: jump= */ + VG_(fprintf)(fp, "jump=%llu ", + jcc->call_counter); + } + + fprint_pos(fp, &target, last); + VG_(fprintf)(fp, "\n"); + fprint_pos(fp, curr, last); + VG_(fprintf)(fp, "\n"); + + jcc->call_counter = 0; + return; + } + + file = jcc->to->cxt->fn[0]->file; + obj = jcc->to->bb->obj; + + /* object of called position different to object of this function?*/ + if (jcc->from->cxt->fn[0]->file->obj != obj) { + print_obj(fp, "cob=", obj); + } + + /* file of called position different to current file? */ + if (last->file != file) { + print_file(fp, "cfi=", file); + } + + if (TG_(clo).mangle_names) + print_mangled_fn(fp, "cfn", jcc->to->cxt, jcc->to->rec_index); + else + print_fn(fp, "cfn", jcc->to->cxt->fn[0]); + + if (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost)) { + VG_(fprintf)(fp, "calls=%llu ", + jcc->call_counter); + + fprint_pos(fp, &target, last); + VG_(fprintf)(fp, "\n"); + fprint_pos(fp, curr, last); + fprint_cost(fp, TG_(dumpmap), jcc->cost); + + TG_(init_cost)( TG_(sets).full, jcc->cost ); + + jcc->call_counter = 0; + } +} + + + +/* Cost summation of functions.We use alternately ccSum[0/1], thus + * ssSum[currSum] for recently read lines with same line number. + */ +static AddrCost ccSum[2]; +static int currSum; + +/* Merge two sorted jCC lists. + * Assumes both input lists are sorted by creation_seq. + * Returns a new merged list that is also sorted by creation_seq. + */ +static jCC* merge_jcc_lists(jCC* left, jCC* right) { + jCC dummy; + dummy.next_from = NULL; + jCC* tail = &dummy; + + while (left && right) { + if (left->creation_seq <= right->creation_seq) { + tail->next_from = left; + left = left->next_from; + } else { + tail->next_from = right; + right = right->next_from; + } + tail = tail->next_from; + } + + tail->next_from = left ? left : right; + return dummy.next_from; +} + +/* Merge sort for jCC lists to ensure chronological dump order. + * Sorts by creation_seq field to preserve execution order. 
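+ *
+ * Added note (not in the original callgrind dump code): this is a plain
+ * top-down merge sort over the singly linked next_from chain, so it runs
+ * in O(n log n) without extra allocation, and merge_jcc_lists() compares
+ * with <=, so the sort is stable for equal creation_seq values.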
+ */ +static jCC* sort_jcc_list(jCC* head) { + if (!head || !head->next_from) return head; + + /* Split list into two halves using slow/fast pointer technique */ + jCC* slow = head; + jCC* fast = head->next_from; + + while (fast && fast->next_from) { + slow = slow->next_from; + fast = fast->next_from->next_from; + } + + /* Split at midpoint */ + jCC* mid = slow->next_from; + slow->next_from = NULL; + + /* Recursively sort both halves and merge */ + return merge_jcc_lists(sort_jcc_list(head), sort_jcc_list(mid)); +} + +/* + * Print all costs of a BBCC: + * - FCCs of instructions + * - JCCs of the unique jump of this BB + * returns True if something was written + */ +static Bool fprint_bbcc(VgFile *fp, BBCC* bbcc, AddrPos* last) +{ + InstrInfo* instr_info; + ULong ecounter; + Bool something_written = False; + jCC* jcc; + AddrCost *currCost, *newCost; + Int jcc_count = 0, instr, i, jmp; + BB* bb = bbcc->bb; + + TG_ASSERT(bbcc->cxt != 0); + TG_DEBUGIF(1) { + VG_(printf)("+ fprint_bbcc (Instr %u): ", bb->instr_count); + TG_(print_bbcc)(15, bbcc); + } + + TG_ASSERT(currSum == 0 || currSum == 1); + currCost = &(ccSum[currSum]); + newCost = &(ccSum[1-currSum]); + + ecounter = bbcc->ecounter_sum; + jmp = 0; + instr_info = &(bb->instr[0]); + for(instr=0; instrinstr_count; instr++, instr_info++) { + + /* get debug info of current instruction address and dump cost + * if TG_(clo).dump_bbs or file/line has changed + */ + Addr instr_addr = bb_addr(bb) + instr_info->instr_offset; + if (!get_debug_pos(bbcc, instr_addr, &(newCost->p))) { + /* if we don't have debug info, don't switch to file "???" */ + newCost->p.file = bbcc->cxt->fn[0]->file; + } + + if (TG_(clo).dump_bbs || TG_(clo).dump_instr || + (newCost->p.line != currCost->p.line) || + (newCost->p.file != currCost->p.file)) { + + if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { + something_written = True; + + /* Output file position and inline function markers */ + fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); + + fprint_fcost(fp, currCost, last); + } + + /* switch buffers */ + currSum = 1 - currSum; + currCost = &(ccSum[currSum]); + newCost = &(ccSum[1-currSum]); + } + + /* add line cost to current cost sum */ + (*TG_(cachesim).add_icost)(currCost->cost, bbcc, instr_info, ecounter); + + /* print jcc's if there are: only jumps */ + if (bb->jmp[jmp].instr == instr) { + jcc_count=0; + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) + if (((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || + (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) + jcc_count++; + + if (jcc_count>0) { + if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { + /* no need to switch buffers, as position is the same */ + fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); + fprint_fcost(fp, currCost, last); + } + get_debug_pos(bbcc, bb_addr(bb)+instr_info->instr_offset, &(currCost->p)); + fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); + something_written = True; + + /* Sort jcc_list by creation sequence to ensure chronological order */ + bbcc->jmp[jmp].jcc_list = sort_jcc_list(bbcc->jmp[jmp].jcc_list); + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { + if (((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || + (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) + fprint_jcc(fp, jcc, &(currCost->p), last, ecounter); + } + } + } + + /* update execution counter */ + if (jmp < bb->cjmp_count) + if (bb->jmp[jmp].instr == instr) { + ecounter -= bbcc->jmp[jmp].ecounter; + jmp++; 
+ } + } + + /* jCCs at end? If yes, dump cumulated line info first */ + jcc_count = 0; + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { + /* yes, if JCC only counts jmp arcs or cost >0 */ + if ( ((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || + (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) + jcc_count++; + } + + if ( (bbcc->skipped && + !TG_(is_zero_cost)(TG_(sets).full, bbcc->skipped)) || + (jcc_count>0) ) { + + if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { + /* no need to switch buffers, as position is the same */ + fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); + fprint_fcost(fp, currCost, last); + } + + get_debug_pos(bbcc, bb_jmpaddr(bb), &(currCost->p)); + fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); + something_written = True; + + /* first, print skipped costs for calls */ + if (bbcc->skipped && !TG_(is_zero_cost)( TG_(sets).full, + bbcc->skipped )) { + TG_(add_and_zero_cost)( TG_(sets).full, + currCost->cost, bbcc->skipped ); +#if 0 + VG_(fprintf)(fp, "# Skipped\n"); +#endif + fprint_fcost(fp, currCost, last); + } + + if (jcc_count > 0) { + /* Sort jcc_list by creation sequence to ensure chronological order */ + bbcc->jmp[jmp].jcc_list = sort_jcc_list(bbcc->jmp[jmp].jcc_list); + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { + TG_ASSERT(jcc->jmp == jmp); + if ( ((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || + (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) + + fprint_jcc(fp, jcc, &(currCost->p), last, ecounter); + } + } + } + + if (TG_(clo).dump_bbs || TG_(clo).dump_bb) { + if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { + something_written = True; + + fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); + fprint_fcost(fp, currCost, last); + } + if (TG_(clo).dump_bbs) VG_(fprintf)(fp, "\n"); + + /* when every cost was immediately written, we must have done so, + * as this function is only called when there's cost in a BBCC + */ + TG_ASSERT(something_written); + } + + bbcc->ecounter_sum = 0; + for(i=0; i<=bbcc->bb->cjmp_count; i++) + bbcc->jmp[i].ecounter = 0; + bbcc->ret_counter = 0; + + TG_DEBUG(1, "- fprint_bbcc: JCCs %d\n", jcc_count); + + return something_written; +} + +/* order by + * recursion, + * from->bb->obj, from->bb->fn + * obj, fn[0]->file, fn + * address + */ +static int my_cmp(BBCC** pbbcc1, BBCC** pbbcc2) +{ +#if 0 + return (*pbbcc1)->bb->offset - (*pbbcc2)->bb->offset; +#else + BBCC *bbcc1 = *pbbcc1; + BBCC *bbcc2 = *pbbcc2; + Context* cxt1 = bbcc1->cxt; + Context* cxt2 = bbcc2->cxt; + int off = 1; + + if (cxt1->fn[0]->file->obj != cxt2->fn[0]->file->obj) + return cxt1->fn[0]->file->obj - cxt2->fn[0]->file->obj; + + if (cxt1->fn[0]->file != cxt2->fn[0]->file) + return cxt1->fn[0]->file - cxt2->fn[0]->file; + + if (cxt1->fn[0] != cxt2->fn[0]) + return cxt1->fn[0] - cxt2->fn[0]; + + if (bbcc1->rec_index != bbcc2->rec_index) + return bbcc1->rec_index - bbcc2->rec_index; + + while((off < cxt1->size) && (off < cxt2->size)) { + fn_node* ffn1 = cxt1->fn[off]; + fn_node* ffn2 = cxt2->fn[off]; + if (ffn1->file->obj != ffn2->file->obj) + return ffn1->file->obj - ffn2->file->obj; + if (ffn1 != ffn2) + return ffn1 - ffn2; + off++; + } + if (cxt1->size > cxt2->size) return 1; + else if (cxt1->size < cxt2->size) return -1; + + return bbcc1->bb->offset - bbcc2->bb->offset; +#endif +} + + + + + +/* modified version of: + * + * qsort -- qsort interface implemented by faster quicksort. + * J. L. Bentley and M. D. 
McIlroy, SPE 23 (1993) 1249-1265. + * Copyright 1993, John Wiley. +*/ + +static __inline__ +void swap(BBCC** a, BBCC** b) +{ + BBCC* t; + t = *a; *a = *b; *b = t; +} + +#if !defined(min) +#define min(x, y) ((x)<=(y) ? (x) : (y)) +#endif + +static +BBCC** med3(BBCC **a, BBCC **b, BBCC **c, int (*cmp)(BBCC**,BBCC**)) +{ return cmp(a, b) < 0 ? + (cmp(b, c) < 0 ? b : cmp(a, c) < 0 ? c : a) + : (cmp(b, c) > 0 ? b : cmp(a, c) > 0 ? c : a); +} + +static BBCC** qsort_start = 0; + +static void TG_(qsort)(BBCC **a, int n, int (*cmp)(BBCC**,BBCC**)) +{ + BBCC **pa, **pb, **pc, **pd, **pl, **pm, **pn, **pv; + int s, r; + BBCC* v; + + TG_DEBUG(8, " qsort(%ld,%ld)\n", a-qsort_start + 0L, n + 0L); + + if (n < 7) { /* Insertion sort on smallest arrays */ + for (pm = a+1; pm < a+n; pm++) + for (pl = pm; pl > a && cmp(pl-1, pl) > 0; pl --) + swap(pl, pl-1); + + TG_DEBUGIF(8) { + for (pm = a; pm < a+n; pm++) { + VG_(printf)(" %3ld BB %#lx, ", + pm - qsort_start + 0L, + bb_addr((*pm)->bb)); + TG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index); + } + } + return; + } + pm = a + n/2; /* Small arrays, middle element */ + if (n > 7) { + pl = a; + pn = a + (n-1); + if (n > 40) { /* Big arrays, pseudomedian of 9 */ + s = n/8; + pl = med3(pl, pl+s, pl+2*s, cmp); + pm = med3(pm-s, pm, pm+s, cmp); + pn = med3(pn-2*s, pn-s, pn, cmp); + } + pm = med3(pl, pm, pn, cmp); /* Mid-size, med of 3 */ + } + + + v = *pm; + pv = &v; + pa = pb = a; + pc = pd = a + (n-1); + for (;;) { + while ((pb <= pc) && ((r=cmp(pb, pv)) <= 0)) { + if (r==0) { + /* same as pivot, to start */ + swap(pa,pb); pa++; + } + pb ++; + } + while ((pb <= pc) && ((r=cmp(pc, pv)) >= 0)) { + if (r==0) { + /* same as pivot, to end */ + swap(pc,pd); pd--; + } + pc --; + } + if (pb > pc) { break; } + swap(pb, pc); + pb ++; + pc --; + } + pb--; + pc++; + + /* put pivot from start into middle */ + if ((s = pa-a)>0) { for(r=0;r0) { for(r=0;rbb)); + TG_(print_cxt)(9, (*pv)->cxt, (*pv)->rec_index); + + s = pb-pa+1; + VG_(printf)(" Lower %ld - %ld:\n", + a-qsort_start + 0L, + a+s-1-qsort_start + 0L); + for (r=0;rbb)); + TG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index); + } + + s = pd-pc+1; + VG_(printf)(" Upper %ld - %ld:\n", + a+n-s-qsort_start + 0L, + a+n-1-qsort_start + 0L); + for (r=0;rbb)); + TG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index); + } + } + + if ((s = pb+1-pa) > 1) TG_(qsort)(a, s, cmp); + if ((s = pd+1-pc) > 1) TG_(qsort)(a+n-s, s, cmp); +} + + +/* Helpers for prepare_dump */ + +static Int prepare_count; +static BBCC** prepare_ptr; + + +static void hash_addCount(BBCC* bbcc) +{ + if ((bbcc->ecounter_sum > 0) || (bbcc->ret_counter>0)) + prepare_count++; +} + +static void hash_addPtr(BBCC* bbcc) +{ + if ((bbcc->ecounter_sum == 0) && + (bbcc->ret_counter == 0)) return; + + *prepare_ptr = bbcc; + prepare_ptr++; +} + + +static void cs_addCount(thread_info* ti) +{ + Int i; + BBCC* bbcc; + + /* add BBCCs with active call in call stack of current thread. 
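+    * Such a BBCC may have had its counters zeroed by a previous dump,
+    * so the hash table scan (hash_addCount above) would not pick it up.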
+ * update cost sums for active calls + */ + + for(i = 0; i < TG_(current_call_stack).sp; i++) { + call_entry* e = &(TG_(current_call_stack).entry[i]); + if (e->jcc == 0) continue; + + TG_(add_diff_cost_lz)( TG_(sets).full, &(e->jcc->cost), + e->enter_cost, TG_(current_state).cost); + bbcc = e->jcc->from; + + TG_DEBUG(1, " [%2d] (tid %u), added active: %s\n", + i,TG_(current_tid),bbcc->cxt->fn[0]->name); + + if (bbcc->ecounter_sum>0 || bbcc->ret_counter>0) { + /* already counted */ + continue; + } + prepare_count++; + } +} + +static void cs_addPtr(thread_info* ti) +{ + Int i; + BBCC* bbcc; + + /* add BBCCs with active call in call stack of current thread. + * update cost sums for active calls + */ + + for(i = 0; i < TG_(current_call_stack).sp; i++) { + call_entry* e = &(TG_(current_call_stack).entry[i]); + if (e->jcc == 0) continue; + + bbcc = e->jcc->from; + + if (bbcc->ecounter_sum>0 || bbcc->ret_counter>0) { + /* already counted */ + continue; + } + + *prepare_ptr = bbcc; + prepare_ptr++; + } +} + + +/** + * Put all BBCCs with costs into a sorted array. + * The returned arrays ends with a null pointer. + * Must be freed after dumping. + */ +static +BBCC** prepare_dump(void) +{ + BBCC **array; + + prepare_count = 0; + + /* if we do not separate among threads, this gives all */ + /* count number of BBCCs with >0 executions */ + TG_(forall_bbccs)(hash_addCount); + + /* even if we do not separate among threads, + * call stacks are separated */ + if (TG_(clo).separate_threads) + cs_addCount(0); + else + TG_(forall_threads)(cs_addCount); + + TG_DEBUG(0, "prepare_dump: %d BBCCs\n", prepare_count); + + /* allocate bbcc array, insert BBCCs and sort */ + prepare_ptr = array = + (BBCC**) TG_MALLOC("cl.dump.pd.1", + (prepare_count+1) * sizeof(BBCC*)); + + TG_(forall_bbccs)(hash_addPtr); + + if (TG_(clo).separate_threads) + cs_addPtr(0); + else + TG_(forall_threads)(cs_addPtr); + + TG_ASSERT(array + prepare_count == prepare_ptr); + + /* end mark */ + *prepare_ptr = 0; + + TG_DEBUG(0," BBCCs inserted\n"); + + qsort_start = array; + TG_(qsort)(array, prepare_count, my_cmp); + + TG_DEBUG(0," BBCCs sorted\n"); + + return array; +} + + + + +static void fprint_cost_ln(VgFile *fp, const HChar* prefix, + const EventMapping* em, const ULong* cost) +{ + HChar *mcost = TG_(mappingcost_as_string)(em, cost); + VG_(fprintf)(fp, "%s%s\n", prefix, mcost); + TG_FREE(mcost); +} + +static ULong bbs_done = 0; +static HChar* filename = 0; + +static +void file_err(void) +{ + VG_(message)(Vg_UserMsg, + "Error: can not open cache simulation output file `%s'\n", + filename ); + VG_(exit)(1); +} + +/** + * Create a new dump file and write header. 
+ * + * Naming: .[.][-] + * is skipped for final dump (trigger==0) + * is skipped for thread 1 with TG_(clo).separate_threads=no + * + * Returns the file descriptor, and -1 on error (no write permission) + */ +static VgFile *new_dumpfile(int tid, const HChar* trigger) +{ + Bool appending = False; + int i; + FullCost sum = 0; + VgFile *fp; + + TG_ASSERT(dumps_initialized); + TG_ASSERT(filename != 0); + + if (!TG_(clo).combine_dumps) { + i = VG_(sprintf)(filename, "%s", out_file); + + if (trigger) + i += VG_(sprintf)(filename+i, ".%d", out_counter); + + if (TG_(clo).separate_threads) + VG_(sprintf)(filename+i, "-%02d", tid); + + fp = VG_(fopen)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0); + } + else { + VG_(sprintf)(filename, "%s", out_file); + fp = VG_(fopen)(filename, VKI_O_WRONLY|VKI_O_APPEND, 0); + if (fp && out_counter>1) + appending = True; + } + + if (fp == NULL) { + fp = VG_(fopen)(filename, VKI_O_CREAT|VKI_O_WRONLY, + VKI_S_IRUSR|VKI_S_IWUSR); + if (fp == NULL) { + /* If the file can not be opened for whatever reason (conflict + between multiple supervised processes?), give up now. */ + file_err(); + } + } + + TG_DEBUG(2, " new_dumpfile '%s'\n", filename); + + if (!appending) + reset_dump_array(); + + + if (!appending) { + /* callgrind format specification, has to be on 1st line */ + VG_(fprintf)(fp, "# callgrind format\n"); + + /* version */ + VG_(fprintf)(fp, "version: 1\n"); + + /* creator */ + VG_(fprintf)(fp, "creator: tracegrind-" VERSION "\n"); + + /* "pid:" line */ + VG_(fprintf)(fp, "pid: %d\n", VG_(getpid)()); + + /* "cmd:" line */ + VG_(fprintf)(fp, "cmd: %s", cmdbuf); + } + + VG_(fprintf)(fp, "\npart: %d\n", out_counter); + if (TG_(clo).separate_threads) { + VG_(fprintf)(fp, "thread: %d\n", tid); + } + + /* "desc:" lines */ + if (!appending) { + VG_(fprintf)(fp, "\n"); + +#if 0 + /* Global options changing the tracing behaviour */ + VG_(fprintf)(fp, "\ndesc: Option: --skip-plt=%s\n", + TG_(clo).skip_plt ? "yes" : "no"); + VG_(fprintf)(fp, "desc: Option: --collect-jumps=%s\n", + TG_(clo).collect_jumps ? "yes" : "no"); + VG_(fprintf)(fp, "desc: Option: --separate-recs=%d\n", + TG_(clo).separate_recursions); + VG_(fprintf)(fp, "desc: Option: --separate-callers=%d\n", + TG_(clo).separate_callers); + + VG_(fprintf)(fp, "desc: Option: --dump-bbs=%s\n", + TG_(clo).dump_bbs ? "yes" : "no"); + VG_(fprintf)(fp, "desc: Option: --separate-threads=%s\n", + TG_(clo).separate_threads ? "yes" : "no"); +#endif + + (*TG_(cachesim).dump_desc)(fp); + } + + VG_(fprintf)(fp, "\ndesc: Timerange: Basic block %llu - %llu\n", + bbs_done, TG_(stat).bb_executions); + + VG_(fprintf)(fp, "desc: Trigger: %s\n", + trigger ? 
trigger : "Program termination"); + +#if 0 + /* Output function specific config + * FIXME */ + for (i = 0; i < N_FNCONFIG_ENTRIES; i++) { + fnc = fnc_table[i]; + while (fnc) { + if (fnc->skip) { + VG_(fprintf)(fp, "desc: Option: --fn-skip=%s\n", fnc->name); + } + if (fnc->dump_at_enter) { + VG_(fprintf)(fp, "desc: Option: --fn-dump-at-enter=%s\n", + fnc->name); + } + if (fnc->dump_at_leave) { + VG_(fprintf)(fp, "desc: Option: --fn-dump-at-leave=%s\n", + fnc->name); + } + if (fnc->separate_callers != TG_(clo).separate_callers) { + VG_(fprintf)(fp, "desc: Option: --separate-callers%d=%s\n", + fnc->separate_callers, fnc->name); + } + if (fnc->separate_recursions != TG_(clo).separate_recursions) { + VG_(fprintf)(fp, "desc: Option: --separate-recs%d=%s\n", + fnc->separate_recursions, fnc->name); + } + fnc = fnc->next; + } + } +#endif + + /* "positions:" line */ + VG_(fprintf)(fp, "\npositions:%s%s%s\n", + TG_(clo).dump_instr ? " instr" : "", + TG_(clo).dump_bb ? " bb" : "", + TG_(clo).dump_line ? " line" : ""); + + /* Some (optional) "event:" lines, giving long names to events. */ + switch (TG_(clo).collect_systime) { + case systime_no: break; + case systime_msec: + VG_(fprintf)(fp, "event: sysTime : sysTime (elapsed ms)\n"); + break; + case systime_usec: + VG_(fprintf)(fp, "event: sysTime : sysTime (elapsed us)\n"); + break; + case systime_nsec: + VG_(fprintf)(fp, "event: sysTime : sysTime (elapsed ns)\n"); + VG_(fprintf)(fp, "event: sysCpuTime : sysCpuTime (system cpu ns)\n"); + break; + default: + tl_assert(0); + } + + /* "events:" line + Note: callgrind_annotate expects the "events:" line to be the last line + of the PartData. In other words, this line is before the first line + of the PartData body. */ + HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(fprintf)(fp, "events: %s\n", evmap); + VG_(free)(evmap); + + /* summary lines */ + sum = TG_(get_eventset_cost)( TG_(sets).full ); + TG_(zero_cost)(TG_(sets).full, sum); + if (TG_(clo).separate_threads) { + thread_info* ti = TG_(get_current_thread)(); + TG_(add_diff_cost)(TG_(sets).full, sum, ti->lastdump_cost, + ti->states.entry[0]->cost); + } + else { + /* This function is called once for thread 1, where + * all costs are summed up when not dumping separate per thread. + * But this is not true for summary: we need to add all threads. 
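+       * For every thread, the difference between its current cost and its
+       * cost at the last dump is added, just as for the single-thread case
+       * above.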
+ */ + int t; + thread_info** thr = TG_(get_threads)(); + for(t=1;tlastdump_cost, + thr[t]->states.entry[0]->cost); + } + } + fprint_cost_ln(fp, "summary: ", TG_(dumpmap), sum); + + /* all dumped cost will be added to total_fcc */ + TG_(init_cost_lz)( TG_(sets).full, &dump_total_cost ); + + VG_(fprintf)(fp, "\n\n"); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Dump to %s\n", filename); + + return fp; +} + + +static void close_dumpfile(VgFile *fp) +{ + if (fp == NULL) return; + + fprint_cost_ln(fp, "totals: ", TG_(dumpmap), + dump_total_cost); + //fprint_fcc_ln(fp, "summary: ", &dump_total_fcc); + TG_(add_cost_lz)(TG_(sets).full, + &TG_(total_cost), dump_total_cost); + + VG_(fclose)(fp); + + if (filename[0] == '.') { + if (-1 == VG_(rename) (filename, filename+1)) { + /* Can not rename to correct file name: give out warning */ + VG_(message)(Vg_DebugMsg, "Warning: Can not rename .%s to %s\n", + filename, filename); + } + } +} + + +/* Helper for print_bbccs */ + +static const HChar* print_trigger; + +static void print_bbccs_of_thread(thread_info* ti) +{ + BBCC **p, **array; + FnPos lastFnPos; + AddrPos lastAPos; + + TG_DEBUG(1, "+ print_bbccs(tid %u)\n", TG_(current_tid)); + + VgFile *print_fp = new_dumpfile(TG_(current_tid), print_trigger); + if (print_fp == NULL) { + TG_DEBUG(1, "- print_bbccs(tid %u): No output...\n", TG_(current_tid)); + return; + } + + p = array = prepare_dump(); + init_fpos(&lastFnPos); + init_apos(&lastAPos, 0, 0, 0); + + while(1) { + + /* on context/function change, print old cost buffer before */ + if (lastFnPos.cxt && ((*p==0) || + (lastFnPos.cxt != (*p)->cxt) || + (lastFnPos.rec_index != (*p)->rec_index))) { + if (!TG_(is_zero_cost)( TG_(sets).full, ccSum[currSum].cost )) { + /* no need to switch buffers, as position is the same */ + fprint_apos(print_fp, &(ccSum[currSum].p), &lastAPos, + lastFnPos.cxt->fn[0]->file, 0); + fprint_fcost(print_fp, &ccSum[currSum], &lastAPos); + } + + if (ccSum[currSum].p.file != lastFnPos.cxt->fn[0]->file) { + /* switch back to file of function */ + print_file(print_fp, "fe=", lastFnPos.cxt->fn[0]->file); + } + VG_(fprintf)(print_fp, "\n"); + } + + if (*p == 0) break; + + if (print_fn_pos(print_fp, &lastFnPos, *p)) { + + /* new function */ + init_apos(&lastAPos, 0, 0, (*p)->cxt->fn[0]->file); + init_fcost(&ccSum[0], 0, 0, 0); + init_fcost(&ccSum[1], 0, 0, 0); + currSum = 0; + last_inline_fn = 0; /* reset inline function tracking */ + } + + if (TG_(clo).dump_bbs) { + /* FIXME: Specify Object of BB if different to object of fn */ + int i; + ULong ecounter = (*p)->ecounter_sum; + VG_(fprintf)(print_fp, "bb=%#lx ", (UWord)(*p)->bb->offset); + for(i = 0; i<(*p)->bb->cjmp_count;i++) { + VG_(fprintf)(print_fp, "%u %llu ", + (*p)->bb->jmp[i].instr, + ecounter); + ecounter -= (*p)->jmp[i].ecounter; + } + VG_(fprintf)(print_fp, "%u %llu\n", + (*p)->bb->instr_count, + ecounter); + } + + fprint_bbcc(print_fp, *p, &lastAPos); + + p++; + } + + close_dumpfile(print_fp); + VG_(free)(array); + + /* set counters of last dump */ + TG_(copy_cost)( TG_(sets).full, ti->lastdump_cost, + TG_(current_state).cost ); + + TG_DEBUG(1, "- print_bbccs(tid %u)\n", TG_(current_tid)); +} + + +static void print_bbccs(const HChar* trigger, Bool only_current_thread) +{ + init_dump_array(); + init_debug_cache(); + + print_trigger = trigger; + + if (!TG_(clo).separate_threads) { + /* All BBCC/JCC costs is stored for thread 1 */ + Int orig_tid = TG_(current_tid); + + TG_(switch_thread)(1); + print_bbccs_of_thread( TG_(get_current_thread)() ); + 
TG_(switch_thread)(orig_tid); + } + else if (only_current_thread) + print_bbccs_of_thread( TG_(get_current_thread)() ); + else + TG_(forall_threads)(print_bbccs_of_thread); + + free_dump_array(); +} + + +void TG_(dump_profile)(const HChar* trigger, Bool only_current_thread) +{ + TG_DEBUG(2, "+ dump_profile(Trigger '%s')\n", + trigger ? trigger : "Prg.Term."); + + TG_(init_dumps)(); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Start dumping at BB %llu (%s)...\n", + TG_(stat).bb_executions, + trigger ? trigger : "Prg.Term."); + + out_counter++; + + print_bbccs(trigger, only_current_thread); + + bbs_done = TG_(stat).bb_executions++; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Dumping done.\n"); +} + +/* Copy command to cmd buffer. We want to original command line + * (can change at runtime) + */ +static +void init_cmdbuf(void) +{ + SizeT size; + Int i,j; + + /* Pass #1: How many bytes do we need? */ + size = 1; // leading ' ' + size += VG_(strlen)( VG_(args_the_exename) ); + for (i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) { + const HChar *arg = *(HChar**)VG_(indexXA)( VG_(args_for_client), i ); + size += 1; // separator ' ' + // escape NL in arguments to not break dump format + for(j=0; arg[j]; j++) + switch(arg[j]) { + case '\n': + case '\\': + size++; // fall through + default: + size++; + } + } + + cmdbuf = TG_MALLOC("cl.dump.ic.1", size + 1); // +1 for '\0' + + /* Pass #2: Build up the string */ + size = VG_(sprintf)(cmdbuf, " %s", VG_(args_the_exename)); + + for(i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) { + const HChar *arg = * (HChar**) VG_(indexXA)( VG_(args_for_client), i ); + cmdbuf[size++] = ' '; + for(j=0; arg[j]; j++) + switch(arg[j]) { + case '\n': + cmdbuf[size++] = '\\'; + cmdbuf[size++] = 'n'; + break; + case '\\': + cmdbuf[size++] = '\\'; + cmdbuf[size++] = '\\'; + break; + default: + cmdbuf[size++] = arg[j]; + break; + } + } + cmdbuf[size] = '\0'; +} + +/* + * Set up file names for dump output: . + * is derived from the output format string, which defaults + * to "tracegrind.out.%p", where %p is replaced with the PID. + * For the final file name, on intermediate dumps a counter is appended, + * and further, if separate dumps per thread are requested, the thread ID. + * + * always starts with a full absolute path. + * If the output format string represents a relative path, the current + * working directory at program start is used. + * + * This function has to be called every time a profile dump is generated + * to be able to react on PID changes. + */ +void TG_(init_dumps)(void) +{ + SysRes res; + + static int thisPID = 0; + int currentPID = VG_(getpid)(); + if (currentPID == thisPID) { + /* already initialized, and no PID change */ + TG_ASSERT(out_file != 0); + return; + } + thisPID = currentPID; + + if (!TG_(clo).out_format) + TG_(clo).out_format = DEFAULT_OUTFORMAT; + + /* If a file name was already set, clean up before */ + if (out_file) { + VG_(free)(out_file); + VG_(free)(filename); + out_counter = 0; + } + + // Setup output filename. + out_file = + VG_(expand_file_name)("--tracegrind-out-file", TG_(clo).out_format); + + /* allocate space big enough for final filenames */ + filename = (HChar*) TG_MALLOC("cl.dump.init_dumps.2", + VG_(strlen)(out_file)+32); + + /* Make sure the output base file can be written. + * This is used for the dump at program termination. 
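+    * Intermediate parts get a counter and/or thread id appended to this
+    * base name by new_dumpfile (e.g. tracegrind.out.1234.2-03 for pid 1234,
+    * part 2, thread 3), so only the base file is checked here.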
+ * We stop with an error here if we can not create the + * file: This is probably because of missing rights, + * and trace parts wouldn't be allowed to be written, too. + */ + VG_(strcpy)(filename, out_file); + res = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0); + if (sr_isError(res)) { + res = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY, + VKI_S_IRUSR|VKI_S_IWUSR); + if (sr_isError(res)) { + file_err(); + } + } + if (!sr_isError(res)) VG_(close)( (Int)sr_Res(res) ); + + if (!dumps_initialized) + init_cmdbuf(); + + dumps_initialized = True; +} diff --git a/tracegrind/events.c b/tracegrind/events.c new file mode 100644 index 000000000..3a7820aff --- /dev/null +++ b/tracegrind/events.c @@ -0,0 +1,505 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- events.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/* This should be 2**MAX_EVENTGROUP_COUNT */ +#define MAX_EVENTSET_COUNT 1024 + +static EventGroup* eventGroup[MAX_EVENTGROUP_COUNT]; +static EventSet* eventSetTable[MAX_EVENTSET_COUNT]; +static Bool eventSets_initialized = 0; + +static +void initialize_event_sets(void) +{ + Int i; + + if (eventSets_initialized) return; + + for(i=0; i< MAX_EVENTGROUP_COUNT; i++) + eventGroup[i] = 0; + + for(i=0; i< MAX_EVENTSET_COUNT; i++) + eventSetTable[i] = 0; + + eventSets_initialized = 1; + } + +static +EventGroup* new_event_group(int id, int n) +{ + EventGroup* eg; + + initialize_event_sets(); + + TG_ASSERT(id>=0 && idsize = n; + eventGroup[id] = eg; + return eg; +} + +EventGroup* TG_(register_event_group) (int id, const HChar* n1) +{ + EventGroup* eg = new_event_group(id, 1); + eg->name[0] = n1; + + return eg; +} + +EventGroup* TG_(register_event_group2)(int id, const HChar* n1, + const HChar* n2) +{ + EventGroup* eg = new_event_group(id, 2); + eg->name[0] = n1; + eg->name[1] = n2; + + return eg; +} + +EventGroup* TG_(register_event_group3)(int id, const HChar* n1, + const HChar* n2, const HChar* n3) +{ + EventGroup* eg = new_event_group(id, 3); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + + return eg; +} + +EventGroup* TG_(register_event_group4)(int id, const HChar* n1, + const HChar* n2, const HChar* n3, + const HChar* n4) +{ + EventGroup* eg = new_event_group(id, 4); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + eg->name[3] = n4; + + return eg; +} + +EventGroup* TG_(get_event_group)(int id) +{ + TG_ASSERT(id>=0 && id= MAX_EVENTSET_COUNT) return 0; + + initialize_event_sets(); + if (eventSetTable[mask]) return eventSetTable[mask]; + + es = (EventSet*) TG_MALLOC("cl.events.eventset.1", sizeof(EventSet)); + es->mask = mask; + + offset = 
0; + count = 0; + for(i=0;ioffset[i] = offset; + if ( ((mask & (1u<size; + count++; + } + es->size = offset; + es->count = count; + + eventSetTable[mask] = es; + return es; +} + +EventSet* TG_(get_event_set)(Int id) +{ + TG_ASSERT(id>=0 && id=0 && id1=0 && id2=0 && idmask | (1u << id)); +} + +EventSet* TG_(add_event_group2)(EventSet* es, Int id1, Int id2) +{ + TG_ASSERT(id1>=0 && id1=0 && id2mask | (1u << id1) | (1u << id2)); +} + +EventSet* TG_(add_event_set)(EventSet* es1, EventSet* es2) +{ + if (!es1) es1 = eventset_from_mask(0); + if (!es2) es2 = eventset_from_mask(0); + return eventset_from_mask(es1->mask | es2->mask); +} + + +/* Get cost array for an event set */ +ULong* TG_(get_eventset_cost)(EventSet* es) +{ + return TG_(get_costarray)(es->size); +} + +/* Set all costs of an event set to zero */ +void TG_(init_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) return; + + for(i=0; isize; i++) + cost[i] = 0; +} + +/* Set all costs of an event set to zero */ +void TG_(init_cost_lz)(EventSet* es, ULong** cost) +{ + Int i; + + TG_ASSERT(cost != 0); + if (!(*cost)) + *cost = TG_(get_eventset_cost)(es); + + for(i=0; isize; i++) + (*cost)[i] = 0; +} + +void TG_(zero_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) return; + + for(i=0;isize;i++) + cost[i] = 0; +} + +Bool TG_(is_zero_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) return True; + + for(i=0; isize; i++) + if (cost[i] != 0) return False; + + return True; +} + +void TG_(copy_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + + if (!src) { + TG_(zero_cost)(es, dst); + return; + } + TG_ASSERT(dst != 0); + + for(i=0;isize;i++) + dst[i] = src[i]; +} + +void TG_(copy_cost_lz)(EventSet* es, ULong** pdst, ULong* src) +{ + Int i; + ULong* dst; + + TG_ASSERT(pdst != 0); + + if (!src) { + TG_(zero_cost)(es, *pdst); + return; + } + dst = *pdst; + if (!dst) + dst = *pdst = TG_(get_eventset_cost)(es); + + for(i=0;isize;i++) + dst[i] = src[i]; +} + +void TG_(add_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + + if (!src) return; + TG_ASSERT(dst != 0); + + for(i=0; isize; i++) + dst[i] += src[i]; +} + +void TG_(add_cost_lz)(EventSet* es, ULong** pdst, ULong* src) +{ + Int i; + ULong* dst; + + if (!src) return; + TG_ASSERT(pdst != 0); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(copy_cost)(es, dst, src); + return; + } + + for(i=0; isize; i++) + dst[i] += src[i]; +} + +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + if (!src) return False; + + for(i=0; isize; i++) { + if (src[i]==0) continue; + dst[i] += src[i]; + src[i] = 0; + is_nonzero = True; + } + + return is_nonzero; +} + +/* Adds src to dst and zeros src. 
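+   Only event groups present in the source event set are transferred; the
+   destination set must contain each of these groups (asserted below).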
Returns false if nothing changed */ +Bool TG_(add_and_zero_cost2)(EventSet* esDst, ULong* dst, + EventSet* esSrc, ULong* src) +{ + Int i,j; + Bool is_nonzero = False; + UInt mask; + EventGroup *eg; + ULong *egDst, *egSrc; + + TG_ASSERT((esDst != 0) && (dst != 0) && (esSrc != 0)); + if (!src) return False; + + for(i=0, mask=1; imask & mask)==0) continue; + if (eventGroup[i] ==0) continue; + + /* if src has a subset, dst must have, too */ + TG_ASSERT((esDst->mask & mask)>0); + eg = eventGroup[i]; + egSrc = src + esSrc->offset[i]; + egDst = dst + esDst->offset[i]; + for(j=0; jsize; j++) { + if (egSrc[j]==0) continue; + egDst[j] += egSrc[j]; + egSrc[j] = 0; + is_nonzero = True; + } + } + + return is_nonzero; +} + + + +/* Adds difference of new and old to dst, and set old to new. + * Returns false if nothing changed */ +Bool TG_(add_diff_cost)(EventSet* es, ULong* dst, ULong* old, ULong* new_cost) +{ + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + TG_ASSERT(old && new_cost); + + for(i=0; isize; i++) { + if (new_cost[i] == old[i]) continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; +} + +Bool TG_(add_diff_cost_lz)(EventSet* es, ULong** pdst, ULong* old, ULong* new_cost) +{ + Int i; + ULong* dst; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (pdst != 0)); + TG_ASSERT(old && new_cost); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(zero_cost)(es, dst); + } + + for(i=0; isize; i++) { + if (new_cost[i] == old[i]) continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; +} + + +/* Allocate space for an event mapping */ +EventMapping* TG_(get_eventmapping)(EventSet* es) +{ + EventMapping* em; + + TG_ASSERT(es != 0); + + em = (EventMapping*) TG_MALLOC("cl.events.geMapping.1", + sizeof(EventMapping) + + sizeof(struct EventMappingEntry) * + es->size); + em->capacity = es->size; + em->size = 0; + em->es = es; + + return em; +} + +void TG_(append_event)(EventMapping* em, const HChar* n) +{ + Int i, j, offset = 0; + UInt mask; + EventGroup* eg; + + TG_ASSERT(em != 0); + for(i=0, mask=1; ies->mask & mask)==0) continue; + if (eventGroup[i] ==0) continue; + + eg = eventGroup[i]; + for(j=0; jsize; j++, offset++) { + if (VG_(strcmp)(n, eg->name[j])!=0) + continue; + + TG_ASSERT(em->capacity > em->size); + em->entry[em->size].group = i; + em->entry[em->size].index = j; + em->entry[em->size].offset = offset; + em->size++; + return; + } + } +} + + +/* Returns pointer to dynamically string. The string will be overwritten + with each invocation. */ +HChar *TG_(eventmapping_as_string)(const EventMapping* em) +{ + Int i; + EventGroup* eg; + + TG_ASSERT(em != 0); + + XArray *xa = VG_(newXA)(VG_(malloc), "cl.events.emas", VG_(free), + sizeof(HChar)); + + for(i=0; i< em->size; i++) { + if (i > 0) { + VG_(xaprintf)(xa, "%c", ' '); + } + eg = eventGroup[em->entry[i].group]; + TG_ASSERT(eg != 0); + VG_(xaprintf)(xa, "%s", eg->name[em->entry[i].index]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar *buf = VG_(strdup)("cl.events.emas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; +} + +/* Returns pointer to dynamically allocated string. Caller needs to + VG_(free) it. 
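+   Trailing zero counts are omitted; zeros in the middle are printed so
+   that the following counts keep their position (e.g. the costs
+   3,0,0,5,0 come out as "3 0 0 5").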
*/ +HChar *TG_(mappingcost_as_string)(const EventMapping* em, const ULong* c) +{ + Int i, skipped = 0; + + if (!c || em->size==0) return VG_(strdup)("cl.events.mcas", ""); + + XArray *xa = VG_(newXA)(VG_(malloc), "cl.events.mcas", VG_(free), + sizeof(HChar)); + + /* At least one entry */ + VG_(xaprintf)(xa, "%llu", c[em->entry[0].offset]); + + for(i=1; isize; i++) { + if (c[em->entry[i].offset] == 0) { + skipped++; + continue; + } + while(skipped>0) { + VG_(xaprintf)(xa, " 0"); + skipped--; + } + VG_(xaprintf)(xa, " %llu", c[em->entry[i].offset]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar *buf = VG_(strdup)("cl.events.mas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; +} diff --git a/tracegrind/events.h b/tracegrind/events.h new file mode 100644 index 000000000..3be144222 --- /dev/null +++ b/tracegrind/events.h @@ -0,0 +1,131 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- events.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Abstractions for 64-bit cost lists (events.h) */ + +#ifndef TG_EVENTS +#define TG_EVENTS + +#include "pub_tool_basics.h" + +#define TG_(str) VGAPPEND(vgTracegrind_,str) + +/* Event groups consist of one or more named event types. + * Event sets are constructed from such event groups. + * + * Event groups have to be registered globally with a unique ID + * before they can be used in an event set. + * A group can appear at most once in a event set. + */ + +#define MAX_EVENTGROUP_COUNT 10 + +typedef struct _EventGroup EventGroup; +struct _EventGroup { + Int size; + const HChar* name[0]; +}; + +/* return 0 if event group can not be registered */ +EventGroup* TG_(register_event_group) (int id, const HChar*); +EventGroup* TG_(register_event_group2)(int id, const HChar*, const HChar*); +EventGroup* TG_(register_event_group3)(int id, const HChar*, const HChar*, + const HChar*); +EventGroup* TG_(register_event_group4)(int id, const HChar*, const HChar*, + const HChar*, const HChar*); +EventGroup* TG_(get_event_group)(int id); + +/* Event sets are defined by event groups they consist of. 
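+   A set is identified by the bit mask of the ids of the groups it
+   contains, so requesting the same combination of groups always yields
+   the same EventSet.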
*/ + +typedef struct _EventSet EventSet; +struct _EventSet { + /* if subset with ID x is in the set, then bit x is set */ + UInt mask; + Int count; + Int size; + Int offset[MAX_EVENTGROUP_COUNT]; + }; + +/* Same event set is returned when requesting same event groups */ +EventSet* TG_(get_event_set)(Int id); +EventSet* TG_(get_event_set2)(Int id1, Int id2); +EventSet* TG_(add_event_group)(EventSet*, Int id); +EventSet* TG_(add_event_group2)(EventSet*, Int id1, Int id2); +EventSet* TG_(add_event_set)(EventSet*, EventSet*); + + +/* Operations on costs. A cost pointer of 0 means zero cost. + * Functions ending in _lz allocate cost arrays only when needed + */ +ULong* TG_(get_eventset_cost)(EventSet*); +/* Set costs of event set to 0 */ +void TG_(init_cost)(EventSet*,ULong*); +/* This always allocates counter and sets them to 0 */ +void TG_(init_cost_lz)(EventSet*,ULong**); +/* Set costs of an event set to zero */ +void TG_(zero_cost)(EventSet*,ULong*); +Bool TG_(is_zero_cost)(EventSet*,ULong*); +void TG_(copy_cost)(EventSet*,ULong* dst, ULong* src); +void TG_(copy_cost_lz)(EventSet*,ULong** pdst, ULong* src); +void TG_(add_cost)(EventSet*,ULong* dst, ULong* src); +void TG_(add_cost_lz)(EventSet*,ULong** pdst, ULong* src); +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost)(EventSet*,ULong* dst, ULong* src); +Bool TG_(add_and_zero_cost2)(EventSet*,ULong* dst,EventSet*,ULong* src); +/* Adds difference of new and old to to dst, and set old to new. + * Returns false if nothing changed */ +Bool TG_(add_diff_cost)(EventSet*,ULong* dst, ULong* old, ULong* new_cost); +Bool TG_(add_diff_cost_lz)(EventSet*,ULong** pdst, ULong* old, ULong* new_cost); + +/* EventMapping: An ordered subset of events from an event set. + * This is used to print out part of an EventSet, or in another order. + */ +struct EventMappingEntry { + Int group; + Int index; + Int offset; +}; +typedef struct _EventMapping EventMapping; +struct _EventMapping { + EventSet* es; + Int size; + Int capacity; + struct EventMappingEntry entry[0]; +}; + +/* Allocate space for an event mapping */ +EventMapping* TG_(get_eventmapping)(EventSet*); +void TG_(append_event)(EventMapping*, const HChar*); +/* Returns event mapping as a character string. That string is dynamically + allocated and it is the caller's responsibility to free it. + The function never returns NULL. */ +HChar *TG_(eventmapping_as_string)(const EventMapping*); +/* Returns mapping cost as a character string. That string is dynamically + allocated and it is the caller's responsibility to free it. + The function never returns NULL. */ +HChar *TG_(mappingcost_as_string)(const EventMapping*, const ULong*); + +#endif /* TG_EVENTS */ diff --git a/tracegrind/fn.c b/tracegrind/fn.c new file mode 100644 index 000000000..36ab8d394 --- /dev/null +++ b/tracegrind/fn.c @@ -0,0 +1,787 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_fn.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#define N_INITIAL_FN_ARRAY_SIZE 10071 + +static fn_array current_fn_active; + +/* x86_64 defines 4 variants. */ +#define MAX_RESOLVE_ADDRS 4 +static int runtime_resolve_addrs = 0; +static Addr runtime_resolve_addr[MAX_RESOLVE_ADDRS]; +static int runtime_resolve_length[MAX_RESOLVE_ADDRS]; + +// a code pattern is a list of tuples (start offset, length) +struct chunk_t { int start, len; }; +struct pattern +{ + const HChar* name; + int len; + struct chunk_t chunk[]; +}; + +/* Scan for a pattern in the code of an ELF object. + * If found, return true and set runtime_resolve_{addr,length} + */ +__attribute__((unused)) // Possibly; depends on the platform. +static Bool check_code(obj_node* obj, + UChar code[], struct pattern* pat) +{ + Bool found; + Addr addr, end; + int chunk, start, len; + + /* first chunk of pattern should always start at offset 0 and + * have at least 3 bytes */ + TG_ASSERT((pat->chunk[0].start == 0) && (pat->chunk[0].len >2)); + + /* and we cannot be called more than MAX_RESOLVE_ADDRS times */ + TG_ASSERT(runtime_resolve_addrs < MAX_RESOLVE_ADDRS); + + TG_DEBUG(1, "check_code: %s, pattern %s, check %d bytes of [%x %x %x...]\n", + obj->name, pat->name, pat->chunk[0].len, code[0], code[1], code[2]); + + end = obj->start + obj->size - pat->len; + addr = obj->start; + while(addr < end) { + found = (VG_(memcmp)( (void*)addr, code, pat->chunk[0].len) == 0); + + if (found) { + chunk = 1; + while(1) { + start = pat->chunk[chunk].start; + len = pat->chunk[chunk].len; + if (len == 0) break; + + TG_ASSERT(len >2); + TG_DEBUG(1, " found chunk %d at %#lx, checking %d bytes " + "of [%x %x %x...]\n", + chunk-1, addr - obj->start, len, + code[start], code[start+1], code[start+2]); + + if (VG_(memcmp)( (void*)(addr+start), code+start, len) != 0) { + found = False; + break; + } + chunk++; + } + + if (found) { + TG_DEBUG(1, "found at offset %#lx.\n", addr - obj->start); + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Found runtime_resolve (%s): " + "%s +%#lx=%#lx, length %d\n", + pat->name, obj->name + obj->last_slash_pos, + addr - obj->start, addr, pat->len); + + runtime_resolve_addr[runtime_resolve_addrs] = addr; + runtime_resolve_length[runtime_resolve_addrs] = pat->len; + runtime_resolve_addrs++; + return True; + } + } + addr++; + } + TG_DEBUG(1, " found nothing.\n"); + return False; +} + + +/* _ld_runtime_resolve, located in ld.so, needs special handling: + * The jump at end into the resolved function should not be + * represented as a call (as usually done in tracegrind with jumps), + * but as a return + call. Otherwise, the repeated existence of + * _ld_runtime_resolve in call chains will lead to huge cycles, + * making the profile almost worthless. + * + * If ld.so is stripped, the symbol will not appear. But as this + * function is handcrafted assembler, we search for it. 
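+ * Matching is done against the per-platform byte patterns below; the
+ * chunk offsets in each pattern skip bytes that vary between builds
+ * (relocated call targets and other absolute addresses).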
+ * + * We stop if the ELF object name does not seem to be the runtime linker + */ +static Bool search_runtime_resolve(obj_node* obj) +{ +#if defined(VGP_x86_linux) + static UChar code[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x59, 0x87, 0x04, 0x24, 0xc2, 0x08, 0x00 }; + /* Check ranges [0-11] and [16-23] ([12-15] is an absolute address) */ + static struct pattern pat = { + "x86-def", 24, {{ 0,12 }, { 16,8 }, { 24,0}} }; + + /* Pattern for glibc-2.8 on OpenSuse11.0 */ + static UChar code_28[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x8b, 0x0c, 0x24, 0x89, 0x04, 0x24, 0x8b, + /*24*/ 0x44, 0x24, 0x04, 0xc2, 0x0c, 0x00 }; + static struct pattern pat_28 = { + "x86-glibc2.8", 30, {{ 0,12 }, { 16,14 }, { 30,0}} }; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_28_p = check_code(obj, code_28, &pat_28); + if (pat_p || pat_28_p) return True; + return False; +#endif + +#if defined(VGP_ppc32_linux) + static UChar code[] = { + /* 0*/ 0x94, 0x21, 0xff, 0xc0, 0x90, 0x01, 0x00, 0x0c, + /* 8*/ 0x90, 0x61, 0x00, 0x10, 0x90, 0x81, 0x00, 0x14, + /*16*/ 0x7d, 0x83, 0x63, 0x78, 0x90, 0xa1, 0x00, 0x18, + /*24*/ 0x7d, 0x64, 0x5b, 0x78, 0x90, 0xc1, 0x00, 0x1c, + /*32*/ 0x7c, 0x08, 0x02, 0xa6, 0x90, 0xe1, 0x00, 0x20, + /*40*/ 0x90, 0x01, 0x00, 0x30, 0x91, 0x01, 0x00, 0x24, + /*48*/ 0x7c, 0x00, 0x00, 0x26, 0x91, 0x21, 0x00, 0x28, + /*56*/ 0x91, 0x41, 0x00, 0x2c, 0x90, 0x01, 0x00, 0x08, + /*64*/ 0x48, 0x00, 0x02, 0x91, 0x7c, 0x69, 0x03, 0xa6, /* at 64: bl aff0 */ + /*72*/ 0x80, 0x01, 0x00, 0x30, 0x81, 0x41, 0x00, 0x2c, + /*80*/ 0x81, 0x21, 0x00, 0x28, 0x7c, 0x08, 0x03, 0xa6, + /*88*/ 0x81, 0x01, 0x00, 0x24, 0x80, 0x01, 0x00, 0x08, + /*96*/ 0x80, 0xe1, 0x00, 0x20, 0x80, 0xc1, 0x00, 0x1c, + /*104*/0x7c, 0x0f, 0xf1, 0x20, 0x80, 0xa1, 0x00, 0x18, + /*112*/0x80, 0x81, 0x00, 0x14, 0x80, 0x61, 0x00, 0x10, + /*120*/0x80, 0x01, 0x00, 0x0c, 0x38, 0x21, 0x00, 0x40, + /*128*/0x4e, 0x80, 0x04, 0x20 }; + static struct pattern pat = { + "ppc32-def", 132, {{ 0,65 }, { 68,64 }, { 132,0 }} }; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) return False; + return check_code(obj, code, &pat); +#endif + +#if defined(VGP_amd64_linux) + static UChar code[] = { + /* 0*/ 0x48, 0x83, 0xec, 0x38, 0x48, 0x89, 0x04, 0x24, + /* 8*/ 0x48, 0x89, 0x4c, 0x24, 0x08, 0x48, 0x89, 0x54, 0x24, 0x10, + /*18*/ 0x48, 0x89, 0x74, 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, + /*28*/ 0x4c, 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, 0x30, + /*38*/ 0x48, 0x8b, 0x74, 0x24, 0x40, 0x49, 0x89, 0xf3, + /*46*/ 0x4c, 0x01, 0xde, 0x4c, 0x01, 0xde, 0x48, 0xc1, 0xe6, 0x03, + /*56*/ 0x48, 0x8b, 0x7c, 0x24, 0x38, 0xe8, 0xee, 0x01, 0x00, 0x00, + /*66*/ 0x49, 0x89, 0xc3, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + /*74*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*84*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, 0x24, 0x10, + /*94*/ 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, 0x8b, 0x04, 0x24, + /*103*/0x48, 0x83, 0xc4, 0x48, 0x41, 0xff, 0xe3 }; + static struct pattern pat = { + "amd64-def", 110, {{ 0,62 }, { 66,44 }, { 110,0 }} }; + + static UChar code_xsavec[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 
0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*04*/ 0x0f, 0xc7, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*112*/0x10, 0x48, 0x8b, 0x7b, 0x08, + /*117*/0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*122*/0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*128*/0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*136*/0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*144*/0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*152*/0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*160*/0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*168*/0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*176*/0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*184*/0xff, 0xe3 }; + static struct pattern pat_xsavec = { + "amd64-xsavec", 186, {{ 0,11 }, { 15,103 }, {122,64}, { 186,0 }} }; + + static UChar code_xsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x40, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x48, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*104*/0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*112*/0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*120*/0x0f, 0xae, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*128*/0x10, 0x48, 0x8b, 0x7b, 0x08, + /*133*/0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*138*/0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*144*/0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*152*/0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*160*/0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*168*/0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*176*/0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*184*/0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*192*/0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*200*/0xff, 0xe3 }; + static struct pattern pat_xsave = { + "amd64-xsave", 202, {{ 0,11 }, { 15,119 }, {138,64}, { 202,0 }} }; + + static UChar code_fxsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xf0, + /* 8*/ 0x48, 0x81, 0xec, 0x40, 0x02, 0x00, 0x00, 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0x0f, 0xae, 0x44, 0x24, 0x40, 0x48, 0x8b, + /*56*/ 0x73, 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*62*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*67*/ 0x49, 0x89, 0xc3, 0x0f, 0xae, + /*72*/ 0x4c, 0x24, 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + 
/*80*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, + /*88*/ 0x24, 0x20, 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, + /*96*/ 0x8b, 0x54, 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, + /*104*/0x08, 0x48, 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, + /*112*/0x48, 0x8b, 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, + /*120*/0xf2, 0x41, 0xff, 0xe3 }; + static struct pattern pat_fxsave = { + "amd64-fxsave", 124, {{ 0,63 }, { 67,57 }, { 124,0 }} }; + + if ((VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) && + (VG_(strncmp)(obj->name, "/lib64/ld", 9) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib/ld", 11) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib64/ld", 13) != 0)) return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_xsavec_p = check_code(obj, code_xsavec, &pat_xsavec); + Bool pat_xsave_p = check_code(obj, code_xsave, &pat_xsave); + Bool pat_fxsave_p = check_code(obj, code_fxsave, &pat_fxsave); + if (pat_p || pat_xsavec_p || pat_xsave_p || pat_fxsave_p) return True; +#endif + + /* For other platforms, no patterns known */ + return False; +} + + +/*------------------------------------------------------------*/ +/*--- Object/File/Function hash entry operations ---*/ +/*------------------------------------------------------------*/ + +/* Object hash table, fixed */ +static obj_node* obj_table[N_OBJ_ENTRIES]; + +void TG_(init_obj_table)(void) +{ + Int i; + for (i = 0; i < N_OBJ_ENTRIES; i++) + obj_table[i] = 0; +} + +#define HASH_CONSTANT 256 + +static UInt str_hash(const HChar *s, UInt table_size) +{ + int hash_value = 0; + for ( ; *s; s++) + hash_value = (HASH_CONSTANT * hash_value + *s) % table_size; + return hash_value; +} + + +static const HChar* anonymous_obj = "???"; + +static __inline__ +obj_node* new_obj_node(DebugInfo* di, obj_node* next) +{ + Int i; + obj_node* obj; + + obj = (obj_node*) TG_MALLOC("cl.fn.non.1", sizeof(obj_node)); + obj->name = di ? VG_(strdup)( "cl.fn.non.2", + VG_(DebugInfo_get_filename)(di) ) + : anonymous_obj; + for (i = 0; i < N_FILE_ENTRIES; i++) { + obj->files[i] = NULL; + } + TG_(stat).distinct_objs ++; + obj->number = TG_(stat).distinct_objs; + /* JRS 2008 Feb 19: maybe rename .start/.size/.offset to + .text_avma/.text_size/.test_bias to make it clearer what these + fields really mean */ + obj->start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + obj->size = di ? VG_(DebugInfo_get_text_size)(di) : 0; + obj->offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + obj->next = next; + + // not only used for debug output (see static.c) + obj->last_slash_pos = 0; + i = 0; + while(obj->name[i]) { + if (obj->name[i]=='/') obj->last_slash_pos = i+1; + i++; + } + + if (runtime_resolve_addrs == 0) search_runtime_resolve(obj); + + return obj; +} + +obj_node* TG_(get_obj_node)(DebugInfo* di) +{ + obj_node* curr_obj_node; + UInt objname_hash; + const HChar* obj_name; + + obj_name = di ? 
VG_(DebugInfo_get_filename)(di) : anonymous_obj; + + /* lookup in obj hash */ + objname_hash = str_hash(obj_name, N_OBJ_ENTRIES); + curr_obj_node = obj_table[objname_hash]; + while (NULL != curr_obj_node && + VG_(strcmp)(obj_name, curr_obj_node->name) != 0) { + curr_obj_node = curr_obj_node->next; + } + if (NULL == curr_obj_node) { + obj_table[objname_hash] = curr_obj_node = + new_obj_node(di, obj_table[objname_hash]); + } + + return curr_obj_node; +} + + +static __inline__ +file_node* new_file_node(const HChar *filename, + obj_node* obj, file_node* next) +{ + Int i; + file_node* file = (file_node*) TG_MALLOC("cl.fn.nfn.1", + sizeof(file_node)); + file->name = VG_(strdup)("cl.fn.nfn.2", filename); + for (i = 0; i < N_FN_ENTRIES; i++) { + file->fns[i] = NULL; + } + TG_(stat).distinct_files++; + file->number = TG_(stat).distinct_files; + file->obj = obj; + file->next = next; + return file; +} + + +file_node* TG_(get_file_node)(obj_node* curr_obj_node, + const HChar *dir, const HChar *file) +{ + file_node* curr_file_node; + UInt filename_hash; + + /* Build up an absolute pathname, if there is a directory available */ + HChar filename[VG_(strlen)(dir) + 1 + VG_(strlen)(file) + 1]; + VG_(strcpy)(filename, dir); + if (filename[0] != '\0') { + VG_(strcat)(filename, "/"); + } + VG_(strcat)(filename, file); + + /* lookup in file hash */ + filename_hash = str_hash(filename, N_FILE_ENTRIES); + curr_file_node = curr_obj_node->files[filename_hash]; + while (NULL != curr_file_node && + VG_(strcmp)(filename, curr_file_node->name) != 0) { + curr_file_node = curr_file_node->next; + } + if (NULL == curr_file_node) { + curr_obj_node->files[filename_hash] = curr_file_node = + new_file_node(filename, curr_obj_node, + curr_obj_node->files[filename_hash]); + } + + return curr_file_node; +} + +/* forward decl. */ +static void resize_fn_array(void); + +static __inline__ +fn_node* new_fn_node(const HChar *fnname, + file_node* file, fn_node* next) +{ + fn_node* fn = (fn_node*) TG_MALLOC("cl.fn.nfnnd.1", + sizeof(fn_node)); + fn->name = VG_(strdup)("cl.fn.nfnnd.2", fnname); + + TG_(stat).distinct_fns++; + fn->number = TG_(stat).distinct_fns; + fn->last_cxt = 0; + fn->pure_cxt = 0; + fn->file = file; + fn->next = next; + + fn->dump_before = False; + fn->dump_after = False; + fn->zero_before = False; + fn->toggle_collect = False; + fn->skip = False; + fn->obj_skip_checked = False; + fn->pop_on_jump = TG_(clo).pop_on_jump; + fn->is_malloc = False; + fn->is_realloc = False; + fn->is_free = False; + + fn->group = 0; + fn->separate_callers = TG_(clo).separate_callers; + fn->separate_recursions = TG_(clo).separate_recursions; + +#if TG_ENABLE_DEBUG + fn->verbosity = -1; +#endif + + if (TG_(stat).distinct_fns >= current_fn_active.size) + resize_fn_array(); + + return fn; +} + + +/* Get a function node in hash2 with known file node. + * hash nodes are created if needed + */ +static +fn_node* get_fn_node_infile(file_node* curr_file_node, + const HChar *fnname) +{ + fn_node* curr_fn_node; + UInt fnname_hash; + + TG_ASSERT(curr_file_node != 0); + + /* lookup in function hash */ + fnname_hash = str_hash(fnname, N_FN_ENTRIES); + curr_fn_node = curr_file_node->fns[fnname_hash]; + while (NULL != curr_fn_node && + VG_(strcmp)(fnname, curr_fn_node->name) != 0) { + curr_fn_node = curr_fn_node->next; + } + if (NULL == curr_fn_node) { + curr_file_node->fns[fnname_hash] = curr_fn_node = + new_fn_node(fnname, curr_file_node, + curr_file_node->fns[fnname_hash]); + } + + return curr_fn_node; +} + + +/* Get a function node in a Segment. 
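+ * Object, file and function nodes are looked up (and created if missing)
+ * in turn.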
+ * Hash nodes are created if needed. + */ +static __inline__ +fn_node* get_fn_node_inseg(DebugInfo* di, + const HChar *dirname, + const HChar *filename, + const HChar *fnname) +{ + obj_node *obj = TG_(get_obj_node)(di); + file_node *file = TG_(get_file_node)(obj, dirname, filename); + fn_node *fn = get_fn_node_infile(file, fnname); + + return fn; +} + + +Bool TG_(get_debug_info)(Addr instr_addr, + const HChar **dir, + const HChar **file, + const HChar **fn_name, UInt* line_num, + DebugInfo** pDebugInfo) +{ + Bool found_file_line, found_fn, result = True; + UInt line; + + TG_DEBUG(6, " + get_debug_info(%#lx)\n", instr_addr); + + DiEpoch ep = VG_(current_DiEpoch)(); + if (pDebugInfo) { + *pDebugInfo = VG_(find_DebugInfo)(ep, instr_addr); + + // for generated code in anonymous space, pSegInfo is 0 + } + + found_file_line = VG_(get_filename_linenum)(ep, instr_addr, + file, + dir, + &line); + found_fn = VG_(get_fnname)(ep, instr_addr, fn_name); + + if (!found_file_line && !found_fn) { + TG_(stat).no_debug_BBs++; + *file = "???"; + *fn_name = "???"; + if (line_num) *line_num=0; + result = False; + + } else if ( found_file_line && found_fn) { + TG_(stat).full_debug_BBs++; + if (line_num) *line_num=line; + + } else if ( found_file_line && !found_fn) { + TG_(stat).file_line_debug_BBs++; + *fn_name = "???"; + if (line_num) *line_num=line; + + } else /*(!found_file_line && found_fn)*/ { + TG_(stat).fn_name_debug_BBs++; + *file = "???"; + if (line_num) *line_num=0; + } + + TG_DEBUG(6, " - get_debug_info(%#lx): seg '%s', fn %s\n", + instr_addr, + !pDebugInfo ? "-" : + (*pDebugInfo) ? VG_(DebugInfo_get_filename)(*pDebugInfo) : + "(None)", + *fn_name); + + return result; +} + +/* for _libc_freeres_wrapper => _exit renaming */ +static BB* exit_bb = 0; + + +/* + * Attach function struct to a BB from debug info. + */ +fn_node* TG_(get_fn_node)(BB* bb) +{ + const HChar *fnname, *filename, *dirname; + DebugInfo* di; + UInt line_num; + fn_node* fn; + Int i; + + /* fn from debug info is idempotent for a BB */ + if (bb->fn) return bb->fn; + + TG_DEBUG(3,"+ get_fn_node(BB %#lx)\n", bb_addr(bb)); + + /* get function/file name, line number and object of + * the BB according to debug information + */ + TG_(get_debug_info)(bb_addr(bb), + &dirname, &filename, &fnname, &line_num, &di); + + DiEpoch ep = VG_(current_DiEpoch)(); + if (0 == VG_(strcmp)(fnname, "???")) { + int p; + static HChar buf[32]; // for sure large enough + /* Use address as found in library */ + if (sizeof(Addr) == 4) + p = VG_(sprintf)(buf, "%#08lx", (UWord)bb->offset); + else + // 64bit address + p = VG_(sprintf)(buf, "%#016lx", (UWord)bb->offset); + + VG_(sprintf)(buf + p, "%s", + (bb->sect_kind == Vg_SectData) ? " [Data]" : + (bb->sect_kind == Vg_SectBSS) ? " [BSS]" : + (bb->sect_kind == Vg_SectGOT) ? " [GOT]" : + (bb->sect_kind == Vg_SectPLT) ? 
" [PLT]" : ""); + fnname = buf; + } + else { + if (VG_(get_fnname_if_entry)(ep, bb_addr(bb), &fnname)) + bb->is_entry = 1; + } + + /* HACK for correct _exit: + * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind, + * so we rename it back again :-) + */ + if (0 == VG_(strcmp)(fnname, "vgPlain___libc_freeres_wrapper") + && exit_bb) { + TG_(get_debug_info)(bb_addr(exit_bb), + &dirname, &filename, &fnname, &line_num, &di); + + TG_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n"); + } + if (0 == VG_(strcmp)(fnname, "_exit") && !exit_bb) + exit_bb = bb; + + for (i = 0; i < runtime_resolve_addrs; i++) { + if ((bb_addr(bb) >= runtime_resolve_addr[i]) && + (bb_addr(bb) < runtime_resolve_addr[i] + runtime_resolve_length[i])) { + /* BB in runtime_resolve found by code check; use this name */ + fnname = "_dl_runtime_resolve"; + break; + } + } + + /* get fn_node struct for this function */ + fn = get_fn_node_inseg( di, dirname, filename, fnname); + + /* if this is the 1st time the function is seen, + * some attributes are set */ + if (fn->pure_cxt == 0) { + + /* Every function gets a "pure" context, i.e. a context with stack + * depth 1 only with this function. This is for compression of mangled + * names + */ + fn_node* pure[2]; + pure[0] = 0; + pure[1] = fn; + fn->pure_cxt = TG_(get_cxt)(pure+1); + + if (bb->sect_kind == Vg_SectPLT || bb->sect_kind == Vg_SectPLTSEC) + fn->skip = TG_(clo).skip_plt; + + if (VG_(strncmp)(fn->name, "_dl_runtime_resolve", 19)==0) { + fn->pop_on_jump = True; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Symbol match: found runtime_resolve:" + " %s +%#lx=%#lx\n", + bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + } + + fn->is_malloc = (VG_(strcmp)(fn->name, "malloc")==0); + fn->is_realloc = (VG_(strcmp)(fn->name, "realloc")==0); + fn->is_free = (VG_(strcmp)(fn->name, "free")==0); + + /* apply config options from function name patterns + * given on command line */ + TG_(update_fn_config)(fn); + } + + + bb->fn = fn; + bb->line = line_num; + + if (dirname[0]) { + TG_DEBUG(3,"- get_fn_node(BB %#lx): %s (in %s:%u)\n", + bb_addr(bb), fnname, filename, line_num); + } else + TG_DEBUG(3,"- get_fn_node(BB %#lx): %s (in %s/%s:%u)\n", + bb_addr(bb), fnname, dirname, filename, line_num); + + return fn; +} + + +/*------------------------------------------------------------*/ +/*--- Active function array operations ---*/ +/*------------------------------------------------------------*/ + +/* The active function array is a thread-specific array + * of UInts, mapping function numbers to the active count of + * functions. + * The active count is the number of times a function appears + * in the current call stack, and is used when costs for recursion + * levels should be separated. 
+ */ + +UInt* TG_(get_fn_entry)(Int n) +{ + TG_ASSERT(n < current_fn_active.size); + return current_fn_active.array + n; +} + +void TG_(init_fn_array)(fn_array* a) +{ + Int i; + + TG_ASSERT(a != 0); + + a->size = N_INITIAL_FN_ARRAY_SIZE; + if (a->size <= TG_(stat).distinct_fns) + a->size = TG_(stat).distinct_fns+1; + + a->array = (UInt*) TG_MALLOC("cl.fn.gfe.1", + a->size * sizeof(UInt)); + for(i=0;isize;i++) + a->array[i] = 0; +} + +void TG_(copy_current_fn_array)(fn_array* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_fn_active.size; + dst->array = current_fn_active.array; +} + +fn_array* TG_(get_current_fn_array)(void) +{ + return ¤t_fn_active; +} + +void TG_(set_current_fn_array)(fn_array* a) +{ + TG_ASSERT(a != 0); + + current_fn_active.size = a->size; + current_fn_active.array = a->array; + if (current_fn_active.size <= TG_(stat).distinct_fns) + resize_fn_array(); +} + +/* ensure that active_array is big enough: + * is the highest index, so + * has to be bigger than that. + */ +static void resize_fn_array(void) +{ + UInt* new_array; + Int i; + + UInt newsize = current_fn_active.size; + while (newsize <= TG_(stat).distinct_fns) newsize *=2; + + TG_DEBUG(0, "Resize fn_active_array: %u => %u\n", + current_fn_active.size, newsize); + + new_array = (UInt*) TG_MALLOC("cl.fn.rfa.1", newsize * sizeof(UInt)); + for(i=0;i. + + The GNU General Public License is contained in the file COPYING. +*/ + +#ifndef TG_GLOBAL +#define TG_GLOBAL + +#include "pub_tool_basics.h" +#include "pub_tool_vki.h" +#include "pub_tool_debuginfo.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcfile.h" +#include "pub_tool_libcprint.h" +#include "pub_tool_libcproc.h" +#include "pub_tool_machine.h" +#include "pub_tool_mallocfree.h" +#include "pub_tool_options.h" +#include "pub_tool_tooliface.h" +#include "pub_tool_xarray.h" +#include "pub_tool_clientstate.h" +#include "pub_tool_machine.h" // VG_(fnptr_to_fnentry) + +#include "events.h" // defines TG_ macro +#include "costs.h" + + +/*------------------------------------------------------------*/ +/*--- Tracegrind compile options --- */ +/*------------------------------------------------------------*/ + +/* Enable debug output */ +#define TG_ENABLE_DEBUG 1 + +/* Enable experimental features? */ +#define TG_EXPERIMENTAL 0 + + +/*------------------------------------------------------------*/ +/*--- Command line options ---*/ +/*------------------------------------------------------------*/ + +#define DEFAULT_OUTFORMAT "tracegrind.out.%p" + +/* If and how to collect syscall time. + systime_no : do not collect systime + systime_msec : collect syscount, systime elapsed, milli second precision. + systime_usec : collect syscount, systime elapsed, micro second precision. + systime_nsec : collect syscount, systime elapsed, systime cpu, nano second + precision. */ +typedef enum { + systime_no, + systime_msec, + systime_usec, + systime_nsec +} Collect_Systime; + +typedef struct _CommandLineOptions CommandLineOptions; +struct _CommandLineOptions { + + /* Dump format options */ + const HChar* out_format; /* Format string for tracegrind output file name */ + Bool combine_dumps; /* Dump trace parts into same file? */ + Bool compress_strings; + Bool compress_events; + Bool compress_pos; + Bool mangle_names; + Bool compress_mangled; + Bool dump_line; + Bool dump_instr; + Bool dump_bb; + Bool dump_bbs; /* Dump basic block information? */ + + /* Dump generation options */ + ULong dump_every_bb; /* Dump every xxx BBs. 
*/ + + /* Collection options */ + Bool separate_threads; /* Separate threads in dump? */ + Int separate_callers; /* Separate dependent on how many callers? */ + Int separate_recursions; /* Max level of recursions to separate */ + Bool skip_plt; /* Skip functions in PLT section? */ + Bool skip_direct_recursion; /* Increment direct recursions the level? */ + + Bool collect_atstart; /* Start in collecting state ? */ + Bool collect_jumps; /* Collect (cond.) jumps in functions ? */ + + Bool collect_alloc; /* Collect size of allocated memory */ + Collect_Systime collect_systime; /* Collect time for system calls */ + + Bool collect_bus; /* Collect global bus events */ + + /* Instrument options */ + Bool instrument_atstart; /* Instrument at start? */ + Bool simulate_cache; /* Call into cache simulator ? */ + Bool simulate_branch; /* Call into branch prediction simulator ? */ + + /* Call graph generation */ + Bool pop_on_jump; /* Handle a jump between functions as ret+call */ + Int objs_to_skip_count; /* Number of objects to skip */ + HChar** objs_to_skip; /* List of objects to skip */ + +#if TG_ENABLE_DEBUG + Int verbose; + ULong verbose_start; +#endif +}; + +/*------------------------------------------------------------*/ +/*--- Constants ---*/ +/*------------------------------------------------------------*/ + +/* Minimum cache line size allowed */ +#define MIN_LINE_SIZE 16 + + +/*------------------------------------------------------------*/ +/*--- Statistics ---*/ +/*------------------------------------------------------------*/ + +typedef struct _Statistics Statistics; +struct _Statistics { + ULong call_counter; + ULong jcnd_counter; + ULong jump_counter; + ULong rec_call_counter; + ULong ret_counter; + ULong bb_executions; + + Int context_counter; + Int bb_retranslations; + + Int distinct_objs; + Int distinct_files; + Int distinct_fns; + Int distinct_contexts; + Int distinct_bbs; + Int distinct_jccs; + Int distinct_bbccs; + Int distinct_instrs; + Int distinct_skips; + + Int bb_hash_resizes; + Int bbcc_hash_resizes; + Int jcc_hash_resizes; + Int cxt_hash_resizes; + Int fn_array_resizes; + Int call_stack_resizes; + Int fn_stack_resizes; + + Int full_debug_BBs; + Int file_line_debug_BBs; + Int fn_name_debug_BBs; + Int no_debug_BBs; + Int bbcc_lru_misses; + Int jcc_lru_misses; + Int cxt_lru_misses; + Int bbcc_clones; +}; + + +/*------------------------------------------------------------*/ +/*--- Structure declarations ---*/ +/*------------------------------------------------------------*/ + +typedef struct _Context Context; +typedef struct _CC CC; +typedef struct _BB BB; +typedef struct _BBCC BBCC; +typedef struct _jCC jCC; +typedef struct _fCC fCC; +typedef struct _fn_node fn_node; +typedef struct _file_node file_node; +typedef struct _obj_node obj_node; +typedef struct _fn_config fn_config; +typedef struct _call_entry call_entry; +typedef struct _thread_info thread_info; + +/* Costs of event sets. Aliases to arrays of 64-bit values */ +typedef ULong* SimCost; /* All events the simulator can produce */ +typedef ULong* UserCost; +typedef ULong* FullCost; /* Simulator + User */ + + +/* The types of control flow changes that can happen between + * execution of two BBs in a thread. 
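+ *
+ * For illustration (hypothetical guest code): an unconditional jump ends
+ * a BB with jk_Jump, a subroutine call with jk_Call and its return with
+ * jk_Return; a taken conditional branch is recorded as jk_CondJump on the
+ * jCC for that side exit, while plain fall-through into the next BB in
+ * memory is jk_None.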
+ */ +typedef enum { + jk_None = 0, /* no explicit change by a guest instruction */ + jk_Jump, /* regular jump */ + jk_Call, + jk_Return, + jk_CondJump /* conditional jump taken (only used as jCC type) */ +} TgJumpKind; + + +/* JmpCall cost center + * for subroutine call (from->bb->jmp_addr => to->bb->addr) + * + * Each BB has at most one CALL instruction. The list of JCC from + * this call is a pointer to the list head (stored in BBCC), and + * in the JCC struct. + * + * For fast lookup, JCCs are reachable with a hash table, keyed by + * the (from_bbcc,to) pair. is used for the JCC chain + * of one hash table entry. + * + * Cost holds event counts for already returned executions. + * are the event counters at last enter of the subroutine. + * is updated on returning from the subroutine by + * adding the diff of and current event counters to . + * + * After updating, is set to current event counters. Thus, + * events are not counted twice for recursive calls (TODO: True?) + */ + +struct _jCC { + TgJumpKind jmpkind; /* jk_Call, jk_Jump, jk_CondJump */ + jCC* next_hash; /* for hash entry chain */ + jCC* next_from; /* next JCC from a BBCC */ + BBCC *from, *to; /* call arc from/to this BBCC */ + UInt jmp; /* jump no. in source */ + + ULong call_counter; /* no wraparound with 64 bit */ + ULong creation_seq; /* creation order sequence number for correct dump order */ + + FullCost cost; /* simulator + user counters */ +}; + + +/* + * Info for one instruction of a basic block. + */ +typedef struct _InstrInfo InstrInfo; +struct _InstrInfo { + UInt instr_offset; + UInt instr_size; + UInt cost_offset; + EventSet* eventset; +}; + + + +/* + * Info for a side exit in a BB + */ +typedef struct _CJmpInfo CJmpInfo; +struct _CJmpInfo { + UInt instr; /* instruction index for BB.instr array */ + TgJumpKind jmpkind; /* jump kind when leaving BB at this side exit */ +}; + + +/** + * An instrumented basic block (BB). + * + * BBs are put into a resizable hash to allow for fast detection if a + * BB is to be retranslated but cost info is already available. + * The key for a BB is a (object, offset) tupel making it independent + * from possibly multiple mappings of the same ELF object. + * + * At the beginning of each instrumented BB, + * a call to setup_bbcc(), specifying a pointer to the + * according BB structure, is added. + * + * As cost of a BB has to be distinguished depending on the context, + * multiple cost centers for one BB (struct BBCC) exist and the according + * BBCC is set by setup_bbcc. + */ +struct _BB { + obj_node* obj; /* ELF object of BB */ + PtrdiffT offset; /* offset of BB in ELF object file */ + BB* next; /* chaining for a hash entry */ + + VgSectKind sect_kind; /* section of this BB, e.g. PLT */ + UInt instr_count; + + /* filled by TG_(get_fn_node) if debug info is available */ + fn_node* fn; /* debug info for this BB */ + UInt line; + Bool is_entry; /* True if this BB is a function entry */ + + BBCC* bbcc_list; /* BBCCs for same BB (see next_bbcc in BBCC) */ + BBCC* last_bbcc; /* Temporary: Cached for faster access (LRU) */ + + /* filled by TG_(instrument) if not seen before */ + UInt cjmp_count; /* number of side exits */ + CJmpInfo* jmp; /* array of info for condition jumps, + * allocated directly after this struct */ + Bool cjmp_inverted; /* is last side exit actually fall through? 
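+                             * (computed by TG_(collectBlockInfo): VEX may
+                             * invert the last conditional jump of a block
+                             * so that the taken case falls through to the
+                             * next BB in memory)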
*/ + + UInt instr_len; + UInt cost_count; + InstrInfo instr[0]; /* info on instruction sizes and costs */ +}; + + + +/** + * Function context + * + * Basic blocks are always executed in the scope of a context. + * A function context is a list of function nodes representing + * the call chain to the current context: I.e. fn[0] is the + * function we are currently in, fn[1] has called fn[0], and so on. + * Recursion levels are used for fn[0]. + * + * To get a unique number for a full execution context, use + * rec_index = min(rec_separation>,) - 1; + * unique_no = + rec_index + * + * For each Context, recursion index and BB, there can be a BBCC. + */ +struct _Context { + UInt size; // number of function dependencies + UInt base_number; // for context compression & dump array + Context* next; // entry chaining for hash + UWord hash; // for faster lookup... + fn_node* fn[0]; +}; + + +/* + * Cost info for a side exits from a BB + */ +typedef struct _JmpData JmpData; +struct _JmpData { + ULong ecounter; /* number of times the BB was left at this exit */ + jCC* jcc_list; /* JCCs used for this exit */ +}; + + +/* + * Basic Block Cost Center + * + * On demand, multiple BBCCs will be created for the same BB + * dependent on command line options and: + * - current function (it's possible that a BB is executed in the + * context of different functions, e.g. in manual assembler/PLT) + * - current thread ID + * - position where current function is called from + * - recursion level of current function + * + * The cost centres for the instructions of a basic block are + * stored in a contiguous array. + * They are distinguishable by their tag field. + */ +struct _BBCC { + BB* bb; /* BB for this cost center */ + + Context* cxt; /* execution context of this BBCC */ + ThreadId tid; /* only for assertion check purpose */ + UInt rec_index; /* Recursion index in rec->bbcc for this bbcc */ + BBCC** rec_array; /* Variable sized array of pointers to + * recursion BBCCs. Shared. */ + ULong ret_counter; /* how often returned from jccs of this bbcc; + * used to check if a dump for this BBCC is needed */ + + BBCC* next_bbcc; /* Chain of BBCCs for same BB */ + BBCC* lru_next_bbcc; /* BBCC executed next the last time */ + + jCC* lru_from_jcc; /* Temporary: Cached for faster access (LRU) */ + jCC* lru_to_jcc; /* Temporary: Cached for faster access (LRU) */ + FullCost skipped; /* cost for skipped functions called from + * jmp_addr. 
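+                        * (filled by the log_* helpers through
+                        * TG_(current_state).nonskipped while execution is
+                        * inside a skipped function; see the call_entry
+                        * comment below.)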
Allocated lazy */ + + BBCC* next; /* entry chain in hash */ + ULong* cost; /* start of 64bit costs for this BBCC */ + ULong ecounter_sum; /* execution counter for first instruction of BB */ + JmpData jmp[0]; +}; + + +/* the of fn_node, file_node and obj_node are for compressed dumping + * and a index into the dump boolean table and fn_info_table + */ + +struct _fn_node { + HChar* name; + UInt number; + Context* last_cxt; /* LRU info */ + Context* pure_cxt; /* the context with only the function itself */ + file_node* file; /* reverse mapping for 2nd hash */ + fn_node* next; + + Bool dump_before :1; + Bool dump_after :1; + Bool zero_before :1; + Bool toggle_collect :1; + Bool skip :1; + Bool obj_skip_checked : 1; + Bool pop_on_jump : 1; + + Bool is_malloc :1; + Bool is_realloc :1; + Bool is_free :1; + + Int group; + Int separate_callers; + Int separate_recursions; +#if TG_ENABLE_DEBUG + Int verbosity; /* Stores old verbosity level while in function */ +#endif +}; + +/* Quite arbitrary fixed hash sizes */ + +#define N_OBJ_ENTRIES 47 +#define N_FILE_ENTRIES 53 +#define N_FN_ENTRIES 87 + +struct _file_node { + HChar* name; + fn_node* fns[N_FN_ENTRIES]; + UInt number; + obj_node* obj; + file_node* next; +}; + +/* If an object is dlopened multiple times, we hope that is unique; + * and can change with each dlopen, and is + * zero when object is unmapped (possible at dump time). + */ +struct _obj_node { + const HChar* name; + UInt last_slash_pos; + + Addr start; /* Start address of text segment mapping */ + SizeT size; /* Length of mapping */ + PtrdiffT offset; /* Offset between symbol address and file offset */ + + file_node* files[N_FILE_ENTRIES]; + UInt number; + obj_node* next; +}; + +/* an entry in the callstack + * + * is 0 if the function called is not skipped (usual case). + * Otherwise, it is the last non-skipped BBCC. This one gets all + * the calls to non-skipped functions and all costs in skipped + * instructions. + */ +struct _call_entry { + jCC* jcc; /* jCC for this call */ + FullCost enter_cost; /* cost event counters at entering frame */ + Addr sp; /* stack pointer directly after call */ + Addr ret_addr; /* address to which to return to + * is 0 on a simulated call */ + BBCC* nonskipped; /* see above */ + Context* cxt; /* context before call */ + Int fn_sp; /* function stack index before call */ +}; + + +/* + * Execution state of main thread or a running signal handler in + * a thread while interrupted by another signal handler. + * As there's no scheduling among running signal handlers of one thread, + * we only need a subset of a full thread state: + * - event counter + * - collect state + * - last BB, last jump kind, last nonskipped BB + * - callstack pointer for sanity checking and correct unwinding + * after exit + */ +typedef struct _exec_state exec_state; +struct _exec_state { + + /* the signum of the handler, 0 for main thread context + */ + Int sig; + + /* the old call stack pointer at entering the signal handler */ + Int orig_sp; + + FullCost cost; + Bool collect; + Context* cxt; + + /* number of conditional jumps passed in last BB */ + Int jmps_passed; + BBCC* bbcc; /* last BB executed */ + BBCC* nonskipped; + + Int call_stack_bottom; /* Index into fn_stack */ +}; + +/* Global state structures */ +typedef struct _bb_hash bb_hash; +struct _bb_hash { + UInt size, entries; + BB** table; +}; + +typedef struct _cxt_hash cxt_hash; +struct _cxt_hash { + UInt size, entries; + Context** table; +}; + +/* Thread specific state structures, i.e. parts of a thread state. 
+ * There are variables for the current state of each part, + * on which a thread state is copied at thread switch. + */ +typedef struct _bbcc_hash bbcc_hash; +struct _bbcc_hash { + UInt size, entries; + BBCC** table; +}; + +typedef struct _jcc_hash jcc_hash; +struct _jcc_hash { + UInt size, entries; + jCC** table; + jCC* spontaneous; +}; + +typedef struct _fn_array fn_array; +struct _fn_array { + UInt size; + UInt* array; +}; + +typedef struct _call_stack call_stack; +struct _call_stack { + UInt size; + Int sp; + call_entry* entry; +}; + +typedef struct _fn_stack fn_stack; +struct _fn_stack { + UInt size; + fn_node **bottom, **top; +}; + +/* The maximum number of simultaneous running signal handlers per thread. + * This is the number of execution states storable in a thread. + */ +#define MAX_SIGHANDLERS 10 + +typedef struct _exec_stack exec_stack; +struct _exec_stack { + Int sp; /* > 0 if a handler is running */ + exec_state* entry[MAX_SIGHANDLERS]; +}; + +/* Thread State + * + * This structure stores thread specific info while a thread is *not* + * running. See function switch_thread() for save/restore on thread switch. + * + * If --separate-threads=no, BBCCs and JCCs can be shared by all threads, i.e. + * only structures of thread 1 are used. + * This involves variables fn_info_table, bbcc_table and jcc_table. + */ +struct _thread_info { + + /* state */ + fn_stack fns; /* function stack */ + call_stack calls; /* context call arc stack */ + exec_stack states; /* execution states interrupted by signals */ + + /* dump statistics */ + FullCost lastdump_cost; /* Cost at last dump */ + FullCost sighandler_cost; + + /* thread specific data structure containers */ + fn_array fn_active; + jcc_hash jccs; + bbcc_hash bbccs; +}; + +/* Structs used for dumping */ + +/* Address position inside of a BBCC: + * This includes + * - the address offset from the BB start address + * - file/line from debug info for that address (can change inside a BB) + */ +typedef struct _AddrPos AddrPos; +struct _AddrPos { + Addr addr; + Addr bb_addr; + file_node* file; + UInt line; +}; + +/* a simulator cost entity that can be written out in one line */ +typedef struct _AddrCost AddrCost; +struct _AddrCost { + AddrPos p; + SimCost cost; +}; + +/* A function in an execution context */ +typedef struct _FnPos FnPos; +struct _FnPos { + file_node* file; + fn_node* fn; + obj_node* obj; + Context* cxt; + int rec_index; + UInt line; +}; + +/*------------------------------------------------------------*/ +/*--- Cache simulator interface ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if +{ + void (*print_opts)(void); + Bool (*parse_opt)(const HChar* arg); + void (*post_clo_init)(void); + void (*clear)(void); + void (*dump_desc)(VgFile *fp); + void (*printstat)(Int,Int,Int); + void (*add_icost)(SimCost, BBCC*, InstrInfo*, ULong); + void (*finish)(void); + + void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); + void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); + void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); + + void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + // function names of helpers (for debugging generated code) + const HChar *log_1I0D_name, *log_2I0D_name, *log_3I0D_name; + const HChar *log_1I1Dr_name, *log_1I1Dw_name; + const HChar *log_0I1Dr_name, *log_0I1Dw_name; +}; + 
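+/* Illustration (not part of the original sources): a simulator backend is
+ * selected by filling TG_(cachesim) with its option and logging hooks; any
+ * log_* pointer may be left NULL, in which case flushEvents() in main.c
+ * simply emits no helper call for that event kind. A hypothetical,
+ * do-nothing backend could look like:
+ *
+ *    static void nop_post_clo_init(void) { }
+ *    static struct cachesim_if nop_sim = {
+ *       .post_clo_init = nop_post_clo_init,
+ *       .log_1I0D  = NULL,          // no per-instruction helper
+ *       .log_0I1Dr = NULL, .log_0I1Dw = NULL,
+ *    };
+ */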
+// Event groups +#define EG_USE 0 +#define EG_IR 1 +#define EG_DR 2 +#define EG_DW 3 +#define EG_BC 4 +#define EG_BI 5 +#define EG_BUS 6 +#define EG_ALLOC 7 +#define EG_SYS 8 + +struct event_sets { + EventSet *base, *full; +}; + +#define fullOffset(group) (TG_(sets).full->offset[group]) + + +/*------------------------------------------------------------*/ +/*--- Functions ---*/ +/*------------------------------------------------------------*/ + +/* from clo.c */ + +void TG_(set_clo_defaults)(void); +void TG_(update_fn_config)(fn_node*); +Bool TG_(process_cmd_line_option)(const HChar*); +void TG_(print_usage)(void); +void TG_(print_debug_usage)(void); + +/* from sim.c */ +void TG_(init_eventsets)(void); + +/* from main.c */ +Bool TG_(get_debug_info)(Addr, const HChar **dirname, + const HChar **filename, + const HChar **fn_name, UInt*, DebugInfo**); +void TG_(collectBlockInfo)(IRSB* bbIn, UInt*, UInt*, Bool*); +void TG_(set_instrument_state)(const HChar*,Bool); +void TG_(dump_profile)(const HChar* trigger,Bool only_current_thread); +void TG_(zero_all_cost)(Bool only_current_thread); +Int TG_(get_dump_counter)(void); +void TG_(fini)(Int exitcode); + +/* from bb.c */ +void TG_(init_bb_hash)(void); +bb_hash* TG_(get_bb_hash)(void); +BB* TG_(get_bb)(Addr addr, IRSB* bb_in, Bool *seen_before); +void TG_(delete_bb)(Addr addr); + +static __inline__ Addr bb_addr(BB* bb) + { return bb->offset + bb->obj->offset; } +static __inline__ Addr bb_jmpaddr(BB* bb) + { UInt off = (bb->instr_count > 0) ? bb->instr[bb->instr_count-1].instr_offset : 0; + return off + bb->offset + bb->obj->offset; } + +/* from fn.c */ +void TG_(init_fn_array)(fn_array*); +void TG_(copy_current_fn_array)(fn_array* dst); +fn_array* TG_(get_current_fn_array)(void); +void TG_(set_current_fn_array)(fn_array*); +UInt* TG_(get_fn_entry)(Int n); + +void TG_(init_obj_table)(void); +obj_node* TG_(get_obj_node)(DebugInfo* si); +file_node* TG_(get_file_node)(obj_node*, const HChar *dirname, + const HChar* filename); +fn_node* TG_(get_fn_node)(BB* bb); + +/* from bbcc.c */ +void TG_(init_bbcc_hash)(bbcc_hash* bbccs); +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst); +bbcc_hash* TG_(get_current_bbcc_hash)(void); +void TG_(set_current_bbcc_hash)(bbcc_hash*); +void TG_(forall_bbccs)(void (*func)(BBCC*)); +void TG_(zero_bbcc)(BBCC* bbcc); +BBCC* TG_(get_bbcc)(BB* bb); +BBCC* TG_(clone_bbcc)(BBCC* orig, Context* cxt, Int rec_index); +void TG_(setup_bbcc)(BB* bb) VG_REGPARM(1); + + +/* from jumps.c */ +void TG_(init_jcc_hash)(jcc_hash*); +void TG_(copy_current_jcc_hash)(jcc_hash* dst); +void TG_(set_current_jcc_hash)(jcc_hash*); +jCC* TG_(get_jcc)(BBCC* from, UInt, BBCC* to); + +/* from callstack.c */ +void TG_(init_call_stack)(call_stack*); +void TG_(copy_current_call_stack)(call_stack* dst); +void TG_(set_current_call_stack)(call_stack*); +call_entry* TG_(get_call_entry)(Int n); + +void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip); +void TG_(pop_call_stack)(void); +Int TG_(unwind_call_stack)(Addr sp, Int); + +/* from context.c */ +void TG_(init_fn_stack)(fn_stack*); +void TG_(copy_current_fn_stack)(fn_stack*); +void TG_(set_current_fn_stack)(fn_stack*); + +void TG_(init_cxt_table)(void); +Context* TG_(get_cxt)(fn_node** fn); +void TG_(push_cxt)(fn_node* fn); + +/* from threads.c */ +void TG_(init_threads)(void); +thread_info** TG_(get_threads)(void); +thread_info* TG_(get_current_thread)(void); +void TG_(switch_thread)(ThreadId tid); +void TG_(forall_threads)(void (*func)(thread_info*)); +void TG_(run_thread)(ThreadId 
tid); + +void TG_(init_exec_state)(exec_state* es); +void TG_(init_exec_stack)(exec_stack*); +void TG_(copy_current_exec_stack)(exec_stack*); +void TG_(set_current_exec_stack)(exec_stack*); +void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack); +void TG_(post_signal)(ThreadId tid, Int sigNum); +void TG_(run_post_signal_on_call_stack_bottom)(void); + +/* from dump.c */ +void TG_(init_dumps)(void); + +/*------------------------------------------------------------*/ +/*--- Exported global variables ---*/ +/*------------------------------------------------------------*/ + +extern CommandLineOptions TG_(clo); +extern Statistics TG_(stat); +extern EventMapping* TG_(dumpmap); + +/* Function active counter array, indexed by function number */ +extern UInt* TG_(fn_active_array); +extern Bool TG_(instrument_state); + /* min of L1 and LL cache line sizes */ +extern Int TG_(min_line_size); +extern call_stack TG_(current_call_stack); +extern fn_stack TG_(current_fn_stack); +extern exec_state TG_(current_state); +extern ThreadId TG_(current_tid); +extern FullCost TG_(total_cost); +extern struct cachesim_if TG_(cachesim); +extern struct event_sets TG_(sets); + +// set by setup_bbcc at start of every BB, and needed by log_* helpers +extern Addr TG_(bb_base); +extern ULong* TG_(cost_base); + + +/*------------------------------------------------------------*/ +/*--- Debug output ---*/ +/*------------------------------------------------------------*/ + +#if TG_ENABLE_DEBUG + +#define TG_DEBUGIF(x) \ + if (UNLIKELY( (TG_(clo).verbose >x) && \ + (TG_(stat).bb_executions >= TG_(clo).verbose_start))) + +#define TG_DEBUG(x,format,args...) \ + TG_DEBUGIF(x) { \ + TG_(print_bbno)(); \ + VG_(printf)(format,##args); \ + } + +#define TG_ASSERT(cond) \ + if (UNLIKELY(!(cond))) { \ + TG_(print_context)(); \ + TG_(print_bbno)(); \ + tl_assert(cond); \ + } + +#else +#define TG_DEBUGIF(x) if (0) +#define TG_DEBUG(x...) {} +#define TG_ASSERT(cond) tl_assert(cond); +#endif + +/* from debug.c */ +void TG_(print_bbno)(void); +void TG_(print_context)(void); +void TG_(print_jcc)(int s, jCC* jcc); +void TG_(print_bbcc)(int s, BBCC* bbcc); +void TG_(print_bbcc_fn)(BBCC* bbcc); +void TG_(print_execstate)(int s, exec_state* es); +void TG_(print_eventset)(int s, EventSet* es); +void TG_(print_cost)(int s, EventSet*, ULong* cost); +void TG_(print_bb)(int s, BB* bb); +void TG_(print_bbcc_cost)(int s, BBCC*); +void TG_(print_cxt)(int s, Context* cxt, int rec_index); +void TG_(print_short_jcc)(jCC* jcc); +void TG_(print_stackentry)(int s, int sp); +void TG_(print_addr)(Addr addr); +void TG_(print_addr_ln)(Addr addr); + +void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f); +void* TG_(free)(void* p, const HChar* f); +#if 0 +#define TG_MALLOC(_cc,x) TG_(malloc)((_cc),x,__FUNCTION__) +#define TG_FREE(p) TG_(free)(p,__FUNCTION__) +#else +#define TG_MALLOC(_cc,x) VG_(malloc)((_cc),x) +#define TG_FREE(p) VG_(free)(p) +#endif + +#endif /* TG_GLOBAL */ diff --git a/tracegrind/jumps.c b/tracegrind/jumps.c new file mode 100644 index 000000000..d8ee30369 --- /dev/null +++ b/tracegrind/jumps.c @@ -0,0 +1,235 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_jumps.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. 
+ + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Jump Cost Center (JCC) operations, including Calls ---*/ +/*------------------------------------------------------------*/ + +#define N_JCC_INITIAL_ENTRIES 4437 + +static jcc_hash current_jccs; + +/* Global counter for jCC creation sequence to preserve chronological order */ +static ULong jcc_creation_counter = 0; + +void TG_(init_jcc_hash)(jcc_hash* jccs) +{ + Int i; + + TG_ASSERT(jccs != 0); + + jccs->size = N_JCC_INITIAL_ENTRIES; + jccs->entries = 0; + jccs->table = (jCC**) TG_MALLOC("cl.jumps.ijh.1", + jccs->size * sizeof(jCC*)); + jccs->spontaneous = 0; + + for (i = 0; i < jccs->size; i++) + jccs->table[i] = 0; +} + + +void TG_(copy_current_jcc_hash)(jcc_hash* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_jccs.size; + dst->entries = current_jccs.entries; + dst->table = current_jccs.table; + dst->spontaneous = current_jccs.spontaneous; +} + +void TG_(set_current_jcc_hash)(jcc_hash* h) +{ + TG_ASSERT(h != 0); + + current_jccs.size = h->size; + current_jccs.entries = h->entries; + current_jccs.table = h->table; + current_jccs.spontaneous = h->spontaneous; +} + +__inline__ +static UInt jcc_hash_idx(BBCC* from, UInt jmp, BBCC* to, UInt size) +{ + return (UInt) ( (UWord)from + 7* (UWord)to + 13*jmp) % size; +} + +/* double size of jcc table */ +static void resize_jcc_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + jCC** new_table; + UInt new_idx; + jCC *curr_jcc, *next_jcc; + + new_size = 2* current_jccs.size +3; + new_table = (jCC**) TG_MALLOC("cl.jumps.rjt.1", + new_size * sizeof(jCC*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < current_jccs.size; i++) { + if (current_jccs.table[i] == NULL) continue; + + curr_jcc = current_jccs.table[i]; + while (NULL != curr_jcc) { + next_jcc = curr_jcc->next_hash; + + new_idx = jcc_hash_idx(curr_jcc->from, curr_jcc->jmp, + curr_jcc->to, new_size); + + curr_jcc->next_hash = new_table[new_idx]; + new_table[new_idx] = curr_jcc; + if (curr_jcc->next_hash) { + conflicts1++; + if (curr_jcc->next_hash->next_hash) + conflicts2++; + } + + curr_jcc = next_jcc; + } + } + + VG_(free)(current_jccs.table); + + + TG_DEBUG(0, "Resize JCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_jccs.size, new_size, + current_jccs.entries, conflicts1, conflicts2); + + current_jccs.size = new_size; + current_jccs.table = new_table; + TG_(stat).jcc_hash_resizes++; +} + + + +/* new jCC structure: a call was done to a BB of a BBCC + * for a spontaneous call, from is 0 (i.e. 
caller unknown) + */ +static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt new_idx; + + /* check fill degree of jcc hash table and resize if needed (>80%) */ + current_jccs.entries++; + if (10 * current_jccs.entries / current_jccs.size > 8) + resize_jcc_table(); + + jcc = (jCC*) TG_MALLOC("cl.jumps.nj.1", sizeof(jCC)); + + jcc->from = from; + jcc->jmp = jmp; + jcc->to = to; + jcc->jmpkind = jk_Call; + jcc->call_counter = 0; + jcc->creation_seq = jcc_creation_counter++; + jcc->cost = 0; + + /* insert into JCC chain of calling BBCC. + * This list is only used at dumping time */ + + if (from) { + /* Prohibit corruption by array overrun */ + TG_ASSERT(jmp <= from->bb->cjmp_count); + jcc->next_from = from->jmp[jmp].jcc_list; + from->jmp[jmp].jcc_list = jcc; + } + else { + jcc->next_from = current_jccs.spontaneous; + current_jccs.spontaneous = jcc; + } + + /* insert into JCC hash table */ + new_idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc->next_hash = current_jccs.table[new_idx]; + current_jccs.table[new_idx] = jcc; + + TG_(stat).distinct_jccs++; + + TG_DEBUGIF(3) { + VG_(printf)(" new_jcc (now %d): %p\n", + TG_(stat).distinct_jccs, jcc); + } + + return jcc; +} + + +/* get the jCC for a call arc (BBCC->BBCC) */ +jCC* TG_(get_jcc)(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt idx; + + TG_DEBUG(5, "+ get_jcc(bbcc %p/%u => bbcc %p)\n", + from, jmp, to); + + /* first check last recently used JCC */ + jcc = to->lru_to_jcc; + if (jcc && (jcc->from == from) && (jcc->jmp == jmp)) { + TG_ASSERT(to == jcc->to); + TG_DEBUG(5,"- get_jcc: [LRU to] jcc %p\n", jcc); + return jcc; + } + + jcc = from->lru_from_jcc; + if (jcc && (jcc->to == to) && (jcc->jmp == jmp)) { + TG_ASSERT(from == jcc->from); + TG_DEBUG(5, "- get_jcc: [LRU from] jcc %p\n", jcc); + return jcc; + } + + TG_(stat).jcc_lru_misses++; + + idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc = current_jccs.table[idx]; + + while(jcc) { + if ((jcc->from == from) && + (jcc->jmp == jmp) && + (jcc->to == to)) break; + jcc = jcc->next_hash; + } + + if (!jcc) + jcc = new_jcc(from, jmp, to); + + /* set LRU */ + from->lru_from_jcc = jcc; + to->lru_to_jcc = jcc; + + TG_DEBUG(5, "- get_jcc(bbcc %p => bbcc %p)\n", + from, to); + + return jcc; +} + diff --git a/tracegrind/main.c b/tracegrind/main.c new file mode 100644 index 000000000..c6fff12f5 --- /dev/null +++ b/tracegrind/main.c @@ -0,0 +1,2154 @@ + +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- main.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . 
+ + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" +#include "tracegrind.h" +#include "global.h" + +#include "pub_tool_threadstate.h" +#include "pub_tool_gdbserver.h" +#include "pub_tool_transtab.h" // VG_(discard_translations_safely) + +#include "cg_branchpred.c" + +/*------------------------------------------------------------*/ +/*--- Global variables ---*/ +/*------------------------------------------------------------*/ + +/* for all threads */ +CommandLineOptions TG_(clo); +Statistics TG_(stat); +Bool TG_(instrument_state) = True; /* Instrumentation on ? */ + +/* thread and signal handler specific */ +exec_state TG_(current_state); + +/* min of L1 and LL cache line sizes. This only gets set to a + non-zero value if we are doing cache simulation. */ +Int TG_(min_line_size) = 0; + + +/*------------------------------------------------------------*/ +/*--- Statistics ---*/ +/*------------------------------------------------------------*/ + +static void TG_(init_statistics)(Statistics* s) +{ + s->call_counter = 0; + s->jcnd_counter = 0; + s->jump_counter = 0; + s->rec_call_counter = 0; + s->ret_counter = 0; + s->bb_executions = 0; + + s->context_counter = 0; + s->bb_retranslations = 0; + + s->distinct_objs = 0; + s->distinct_files = 0; + s->distinct_fns = 0; + s->distinct_contexts = 0; + s->distinct_bbs = 0; + s->distinct_bbccs = 0; + s->distinct_instrs = 0; + s->distinct_skips = 0; + + s->bb_hash_resizes = 0; + s->bbcc_hash_resizes = 0; + s->jcc_hash_resizes = 0; + s->cxt_hash_resizes = 0; + s->fn_array_resizes = 0; + s->call_stack_resizes = 0; + s->fn_stack_resizes = 0; + + s->full_debug_BBs = 0; + s->file_line_debug_BBs = 0; + s->fn_name_debug_BBs = 0; + s->no_debug_BBs = 0; + s->bbcc_lru_misses = 0; + s->jcc_lru_misses = 0; + s->cxt_lru_misses = 0; + s->bbcc_clones = 0; +} + + +/*------------------------------------------------------------*/ +/*--- Simple callbacks (not cache similator) ---*/ +/*------------------------------------------------------------*/ + +VG_REGPARM(1) +static void log_global_event(InstrInfo* ii) +{ + ULong* cost_Bus; + + TG_DEBUG(6, "log_global_event: Ir %#lx/%u\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size); + + if (!TG_(current_state).collect) return; + + TG_ASSERT( (ii->eventset->mask & (1u<0 ); + + TG_(current_state).cost[ fullOffset(EG_BUS) ]++; + + if (TG_(current_state).nonskipped) + cost_Bus = TG_(current_state).nonskipped->skipped + fullOffset(EG_BUS); + else + cost_Bus = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS]; + cost_Bus[0]++; +} + + +/* For branches, we consult two different predictors, one which + predicts taken/untaken for conditional branches, and the other + which predicts the branch target address for indirect branches + (jump-to-register style ones). 
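+   Each event of these kinds carries a pair of counters: slot [0] counts
+   every dynamic branch and slot [1] only the mispredictions, so e.g.
+   log_cond_branch() below does cost_Bc[0]++ always and cost_Bc[1]++ only
+   when do_cond_branch_predict() reports a miss; log_ind_branch() handles
+   Bi events the same way via do_ind_branch_predict().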
*/ + +static VG_REGPARM(2) +void log_cond_branch(InstrInfo* ii, Word taken) +{ + Bool miss; + Int fullOffset_Bc; + ULong* cost_Bc; + + TG_DEBUG(6, "log_cond_branch: Ir %#lx, taken %ld\n", + TG_(bb_base) + ii->instr_offset, taken); + + miss = 1 & do_cond_branch_predict(TG_(bb_base) + ii->instr_offset, taken); + + if (!TG_(current_state).collect) return; + + TG_ASSERT( (ii->eventset->mask & (1u<0 ); + + if (TG_(current_state).nonskipped) + cost_Bc = TG_(current_state).nonskipped->skipped + fullOffset(EG_BC); + else + cost_Bc = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC]; + + fullOffset_Bc = fullOffset(EG_BC); + TG_(current_state).cost[ fullOffset_Bc ]++; + cost_Bc[0]++; + if (miss) { + TG_(current_state).cost[ fullOffset_Bc+1 ]++; + cost_Bc[1]++; + } +} + +static VG_REGPARM(2) +void log_ind_branch(InstrInfo* ii, UWord actual_dst) +{ + Bool miss; + Int fullOffset_Bi; + ULong* cost_Bi; + + TG_DEBUG(6, "log_ind_branch: Ir %#lx, dst %#lx\n", + TG_(bb_base) + ii->instr_offset, actual_dst); + + miss = 1 & do_ind_branch_predict(TG_(bb_base) + ii->instr_offset, actual_dst); + + if (!TG_(current_state).collect) return; + + TG_ASSERT( (ii->eventset->mask & (1u<0 ); + + if (TG_(current_state).nonskipped) + cost_Bi = TG_(current_state).nonskipped->skipped + fullOffset(EG_BI); + else + cost_Bi = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI]; + + fullOffset_Bi = fullOffset(EG_BI); + TG_(current_state).cost[ fullOffset_Bi ]++; + cost_Bi[0]++; + if (miss) { + TG_(current_state).cost[ fullOffset_Bi+1 ]++; + cost_Bi[1]++; + } +} + +/*------------------------------------------------------------*/ +/*--- Instrumentation structures and event queue handling ---*/ +/*------------------------------------------------------------*/ + +/* Maintain an ordered list of memory events which are outstanding, in + the sense that no IR has yet been generated to do the relevant + helper calls. The BB is scanned top to bottom and memory events + are added to the end of the list, merging with the most recent + notified event where possible (Dw immediately following Dr and + having the same size and EA can be merged). + + This merging is done so that for architectures which have + load-op-store instructions (x86, amd64), the insn is treated as if + it makes just one memory reference (a modify), rather than two (a + read followed by a write at the same address). + + At various points the list will need to be flushed, that is, IR + generated from it. That must happen before any possible exit from + the block (the end, or an IRStmt_Exit). Flushing also takes place + when there is no space to add a new event. + + If we require the simulation statistics to be up to date with + respect to possible memory exceptions, then the list would have to + be flushed before each memory reference. That would however lose + performance by inhibiting event-merging during flushing. + + Flushing the list consists of walking it start to end and emitting + instrumentation IR for each event, in the order in which they + appear. It may be possible to emit a single call for two adjacent + events in order to reduce the number of helper function calls made. + For example, it could well be profitable to handle two adjacent Ir + events with a single helper call. 
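+
+   A hypothetical event stream may make the merging concrete (address 'a'
+   and size 8 are made up): Ir, Dr(a,8), Dw(a,8), Ir, Ir is first reduced
+   by addEvent_Dw() to Ir, Dm(a,8), Ir, Ir, because the write directly
+   follows a read of the same size and address from the same instruction;
+   flushEvents() then emits only two helper calls, log_1I1Dw for the
+   leading Ir+Dm pair and log_2I0D for the two trailing Ir events
+   (assuming the configured simulator provides those hooks).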
*/ + +typedef + IRExpr + IRAtom; + +typedef + enum { + Ev_Ir, // Instruction read + Ev_Dr, // Data read + Ev_Dw, // Data write + Ev_Dm, // Data modify (read then write) + Ev_Bc, // branch conditional + Ev_Bi, // branch indirect (to unknown destination) + Ev_G // Global bus event + } + EventTag; + +typedef + struct { + EventTag tag; + InstrInfo* inode; + union { + struct { + } Ir; + struct { + IRAtom* ea; + Int szB; + } Dr; + struct { + IRAtom* ea; + Int szB; + } Dw; + struct { + IRAtom* ea; + Int szB; + } Dm; + struct { + IRAtom* taken; /* :: Ity_I1 */ + } Bc; + struct { + IRAtom* dst; + } Bi; + struct { + } G; + } Ev; + } + Event; + +static void init_Event ( Event* ev ) { + VG_(memset)(ev, 0, sizeof(Event)); +} + +static IRAtom* get_Event_dea ( Event* ev ) { + switch (ev->tag) { + case Ev_Dr: return ev->Ev.Dr.ea; + case Ev_Dw: return ev->Ev.Dw.ea; + case Ev_Dm: return ev->Ev.Dm.ea; + default: tl_assert(0); + } +} + +static Int get_Event_dszB ( Event* ev ) { + switch (ev->tag) { + case Ev_Dr: return ev->Ev.Dr.szB; + case Ev_Dw: return ev->Ev.Dw.szB; + case Ev_Dm: return ev->Ev.Dm.szB; + default: tl_assert(0); + } +} + + +/* Up to this many unnotified events are allowed. Number is + arbitrary. Larger numbers allow more event merging to occur, but + potentially induce more spilling due to extending live ranges of + address temporaries. */ +#define N_EVENTS 16 + + +/* A struct which holds all the running state during instrumentation. + Mostly to avoid passing loads of parameters everywhere. */ +typedef struct { + /* The current outstanding-memory-event list. */ + Event events[N_EVENTS]; + Int events_used; + + /* The array of InstrInfo's is part of BB struct. */ + BB* bb; + + /* BB seen before (ie. re-instrumentation) */ + Bool seen_before; + + /* Number InstrInfo bins 'used' so far. */ + UInt ii_index; + + // current offset of guest instructions from BB start + UInt instr_offset; + + /* The output SB being constructed. */ + IRSB* sbOut; +} ClgState; + + +static void showEvent ( Event* ev ) +{ + switch (ev->tag) { + case Ev_Ir: + VG_(printf)("Ir (InstrInfo %p) at +%u\n", + ev->inode, ev->inode->instr_offset); + break; + case Ev_Dr: + VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=", + ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB); + ppIRExpr(ev->Ev.Dr.ea); + VG_(printf)("\n"); + break; + case Ev_Dw: + VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=", + ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB); + ppIRExpr(ev->Ev.Dw.ea); + VG_(printf)("\n"); + break; + case Ev_Dm: + VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=", + ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB); + ppIRExpr(ev->Ev.Dm.ea); + VG_(printf)("\n"); + break; + case Ev_Bc: + VG_(printf)("Bc %p GA=", ev->inode); + ppIRExpr(ev->Ev.Bc.taken); + VG_(printf)("\n"); + break; + case Ev_Bi: + VG_(printf)("Bi %p DST=", ev->inode); + ppIRExpr(ev->Ev.Bi.dst); + VG_(printf)("\n"); + break; + case Ev_G: + VG_(printf)("G %p\n", ev->inode); + break; + default: + tl_assert(0); + break; + } +} + +/* Generate code for all outstanding memory events, and mark the queue + empty. Code is generated into cgs->sbOut, and this activity + 'consumes' slots in cgs->bb. 
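+   Each (possibly merged) event becomes at most one dirty helper call,
+   roughly
+
+      di = unsafeIRDirty_0_N(regparms, helperName,
+                             VG_(fnptr_to_fnentry)(helperAddr), argv);
+      addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di));
+
+   and on the first instrumentation pass of a BB the InstrInfo event sets
+   are widened first (TG_(add_event_group)), before cost offsets are
+   assigned at the end of BB instrumentation.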
*/ + +static void flushEvents ( ClgState* clgs ) +{ + Int i, regparms, inew; + const HChar* helperName; + void* helperAddr; + IRExpr** argv; + IRExpr* i_node_expr; + IRDirty* di; + Event* ev; + Event* ev2; + Event* ev3; + + if (!clgs->seen_before) { + // extend event sets as needed + // available sets: D0 Dr + for(i=0; ievents_used; i++) { + ev = &clgs->events[i]; + switch(ev->tag) { + case Ev_Ir: + // Ir event always is first for a guest instruction + TG_ASSERT(ev->inode->eventset == 0); + ev->inode->eventset = TG_(sets).base; + break; + case Ev_Dr: + // extend event set by Dr counters + ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset, + EG_DR); + break; + case Ev_Dw: + case Ev_Dm: + // extend event set by Dw counters + ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset, + EG_DW); + break; + case Ev_Bc: + // extend event set by Bc counters + ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset, + EG_BC); + break; + case Ev_Bi: + // extend event set by Bi counters + ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset, + EG_BI); + break; + case Ev_G: + // extend event set by Bus counter + ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset, + EG_BUS); + break; + default: + tl_assert(0); + } + } + } + + for(i = 0; i < clgs->events_used; i = inew) { + + helperName = NULL; + helperAddr = NULL; + argv = NULL; + regparms = 0; + + /* generate IR to notify event i and possibly the ones + immediately following it. */ + tl_assert(i >= 0 && i < clgs->events_used); + + ev = &clgs->events[i]; + ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL ); + ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL ); + + TG_DEBUGIF(5) { + VG_(printf)(" flush "); + showEvent( ev ); + } + + i_node_expr = mkIRExpr_HWord( (HWord)ev->inode ); + + /* Decide on helper fn to call and args to pass it, and advance + i appropriately. + Dm events have same effect as Dw events */ + switch (ev->tag) { + case Ev_Ir: + /* Merge an Ir with a following Dr. */ + if (ev2 && ev2->tag == Ev_Dr) { + /* Why is this true? It's because we're merging an Ir + with a following Dr. The Ir derives from the + instruction's IMark and the Dr from data + references which follow it. In short it holds + because each insn starts with an IMark, hence an + Ev_Ir, and so these Dr must pertain to the + immediately preceding Ir. Same applies to analogous + assertions in the subsequent cases. */ + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dr_name; + helperAddr = TG_(cachesim).log_1I1Dr; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev2), + mkIRExpr_HWord( get_Event_dszB(ev2) ) ); + regparms = 3; + inew = i+2; + } + /* Merge an Ir with a following Dw/Dm. */ + else + if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) { + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dw_name; + helperAddr = TG_(cachesim).log_1I1Dw; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev2), + mkIRExpr_HWord( get_Event_dszB(ev2) ) ); + regparms = 3; + inew = i+2; + } + /* Merge an Ir with two following Irs. */ + else + if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) { + helperName = TG_(cachesim).log_3I0D_name; + helperAddr = TG_(cachesim).log_3I0D; + argv = mkIRExprVec_3( i_node_expr, + mkIRExpr_HWord( (HWord)ev2->inode ), + mkIRExpr_HWord( (HWord)ev3->inode ) ); + regparms = 3; + inew = i+3; + } + /* Merge an Ir with one following Ir. 
*/ + else + if (ev2 && ev2->tag == Ev_Ir) { + helperName = TG_(cachesim).log_2I0D_name; + helperAddr = TG_(cachesim).log_2I0D; + argv = mkIRExprVec_2( i_node_expr, + mkIRExpr_HWord( (HWord)ev2->inode ) ); + regparms = 2; + inew = i+2; + } + /* No merging possible; emit as-is. */ + else { + helperName = TG_(cachesim).log_1I0D_name; + helperAddr = TG_(cachesim).log_1I0D; + argv = mkIRExprVec_1( i_node_expr ); + regparms = 1; + inew = i+1; + } + break; + case Ev_Dr: + /* Data read or modify */ + helperName = TG_(cachesim).log_0I1Dr_name; + helperAddr = TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev), + mkIRExpr_HWord( get_Event_dszB(ev) ) ); + regparms = 3; + inew = i+1; + break; + case Ev_Dw: + case Ev_Dm: + /* Data write */ + helperName = TG_(cachesim).log_0I1Dw_name; + helperAddr = TG_(cachesim).log_0I1Dw; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev), + mkIRExpr_HWord( get_Event_dszB(ev) ) ); + regparms = 3; + inew = i+1; + break; + case Ev_Bc: + /* Conditional branch */ + helperName = "log_cond_branch"; + helperAddr = &log_cond_branch; + argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken ); + regparms = 2; + inew = i+1; + break; + case Ev_Bi: + /* Branch to an unknown destination */ + helperName = "log_ind_branch"; + helperAddr = &log_ind_branch; + argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst ); + regparms = 2; + inew = i+1; + break; + case Ev_G: + /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */ + helperName = "log_global_event"; + helperAddr = &log_global_event; + argv = mkIRExprVec_1( i_node_expr ); + regparms = 1; + inew = i+1; + break; + default: + tl_assert(0); + } + + TG_DEBUGIF(5) { + if (inew > i+1) { + VG_(printf)(" merge "); + showEvent( ev2 ); + } + if (inew > i+2) { + VG_(printf)(" merge "); + showEvent( ev3 ); + } + if (helperAddr) + VG_(printf)(" call %s (%p)\n", + helperName, helperAddr); + } + + /* helper could be unset depending on the simulator used */ + if (helperAddr == 0) continue; + + /* Add the helper. 
*/ + tl_assert(helperName); + tl_assert(helperAddr); + tl_assert(argv); + di = unsafeIRDirty_0_N( regparms, + helperName, VG_(fnptr_to_fnentry)( helperAddr ), + argv ); + addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); + } + + clgs->events_used = 0; +} + +static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode ) +{ + Event* evt; + tl_assert(clgs->seen_before || (inode->eventset == 0)); + if (!TG_(clo).simulate_cache) return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Ir; + evt->inode = inode; + clgs->events_used++; +} + +static +void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) return; + tl_assert(datasize <= TG_(min_line_size)); + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dr; + evt->inode = inode; + evt->Ev.Dr.szB = datasize; + evt->Ev.Dr.ea = ea; + clgs->events_used++; +} + +static +void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) return; + tl_assert(datasize <= TG_(min_line_size)); + + /* Is it possible to merge this write with the preceding read? */ + if (clgs->events_used > 0) { + Event* lastEvt = &clgs->events[clgs->events_used-1]; + if ( lastEvt->tag == Ev_Dr + && lastEvt->Ev.Dr.szB == datasize + && lastEvt->inode == inode + && eqIRAtom(lastEvt->Ev.Dr.ea, ea)) + { + lastEvt->tag = Ev_Dm; + return; + } + } + + /* No. Add as normal. */ + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dw; + evt->inode = inode; + evt->Ev.Dw.szB = datasize; + evt->Ev.Dw.ea = ea; + clgs->events_used++; +} + +static +void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode, + Int datasize, IRAtom* ea, IRAtom* guard, + Bool isWrite ) +{ + tl_assert(isIRAtom(ea)); + tl_assert(guard); + tl_assert(isIRAtom(guard)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) return; + tl_assert(datasize <= TG_(min_line_size)); + + /* Adding guarded memory actions and merging them with the existing + queue is too complex. Simply flush the queue and add this + action immediately. Since guarded loads and stores are pretty + rare, this is not thought likely to cause any noticeable + performance loss as a result of the loss of event-merging + opportunities. */ + tl_assert(clgs->events_used >= 0); + flushEvents(clgs); + tl_assert(clgs->events_used == 0); + /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */ + IRExpr* i_node_expr; + const HChar* helperName; + void* helperAddr; + IRExpr** argv; + Int regparms; + IRDirty* di; + i_node_expr = mkIRExpr_HWord( (HWord)inode ); + helperName = isWrite ? TG_(cachesim).log_0I1Dw_name + : TG_(cachesim).log_0I1Dr_name; + helperAddr = isWrite ? 
TG_(cachesim).log_0I1Dw + : TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3( i_node_expr, + ea, mkIRExpr_HWord( datasize ) ); + regparms = 3; + di = unsafeIRDirty_0_N( + regparms, + helperName, VG_(fnptr_to_fnentry)( helperAddr ), + argv ); + di->guard = guard; + addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); +} + +static +void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard ) +{ + Event* evt; + tl_assert(isIRAtom(guard)); + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard) + == (sizeof(RegWord)==4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Bc; + evt->inode = inode; + evt->Ev.Bc.taken = guard; + clgs->events_used++; +} + +static +void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo ) +{ + Event* evt; + tl_assert(isIRAtom(whereTo)); + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo) + == (sizeof(RegWord)==4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Bi; + evt->inode = inode; + evt->Ev.Bi.dst = whereTo; + clgs->events_used++; +} + +static +void addEvent_G ( ClgState* clgs, InstrInfo* inode ) +{ + Event* evt; + if (!TG_(clo).collect_bus) return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_G; + evt->inode = inode; + clgs->events_used++; +} + +/* Initialise or check (if already seen before) an InstrInfo for next insn. + We only can set instr_offset/instr_size here. The required event set and + resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest + instructions. The event set is extended as required on flush of the event + queue (when Dm events were determined), cost offsets are determined at + end of BB instrumentation. */ +static +InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size ) +{ + InstrInfo* ii; + tl_assert(clgs->ii_index < clgs->bb->instr_count); + ii = &clgs->bb->instr[ clgs->ii_index ]; + + if (clgs->seen_before) { + TG_ASSERT(ii->instr_offset == clgs->instr_offset); + TG_ASSERT(ii->instr_size == instr_size); + } + else { + ii->instr_offset = clgs->instr_offset; + ii->instr_size = instr_size; + ii->cost_offset = 0; + ii->eventset = 0; + } + + clgs->ii_index++; + clgs->instr_offset += instr_size; + TG_(stat).distinct_instrs++; + + return ii; +} + +// return total number of cost values needed for this BB +static +UInt update_cost_offsets( ClgState* clgs ) +{ + Int i; + InstrInfo* ii; + UInt cost_offset = 0; + + TG_ASSERT(clgs->bb->instr_count == clgs->ii_index); + for(i=0; iii_index; i++) { + ii = &clgs->bb->instr[i]; + if (clgs->seen_before) { + TG_ASSERT(ii->cost_offset == cost_offset); + } else + ii->cost_offset = cost_offset; + cost_offset += ii->eventset ? 
ii->eventset->size : 0; + } + + return cost_offset; +} + +/*------------------------------------------------------------*/ +/*--- Instrumentation ---*/ +/*------------------------------------------------------------*/ + +#if defined(VG_BIGENDIAN) +# define CLGEndness Iend_BE +#elif defined(VG_LITTLEENDIAN) +# define CLGEndness Iend_LE +#else +# error "Unknown endianness" +#endif + +static +Addr IRConst2Addr(IRConst* con) +{ + Addr addr; + + if (sizeof(RegWord) == 4) { + TG_ASSERT( con->tag == Ico_U32 ); + addr = con->Ico.U32; + } + else if (sizeof(RegWord) == 8) { + TG_ASSERT( con->tag == Ico_U64 ); + addr = con->Ico.U64; + } + else + VG_(tool_panic)("Tracegrind: invalid Addr type"); + + return addr; +} + +/* First pass over a BB to instrument, counting instructions and jumps + * This is needed for the size of the BB struct to allocate + * + * Called from TG_(get_bb) + */ +void TG_(collectBlockInfo)(IRSB* sbIn, + /*INOUT*/ UInt* instrs, + /*INOUT*/ UInt* cjmps, + /*INOUT*/ Bool* cjmp_inverted) +{ + Int i; + IRStmt* st; + Addr instrAddr =0, jumpDst; + UInt instrLen = 0; + Bool toNextInstr = False; + + // Ist_Exit has to be ignored in preamble code, before first IMark: + // preamble code is added by VEX for self modifying code, and has + // nothing to do with client code + Bool inPreamble = True; + + if (!sbIn) return; + + for (i = 0; i < sbIn->stmts_used; i++) { + st = sbIn->stmts[i]; + if (Ist_IMark == st->tag) { + inPreamble = False; + + instrAddr = st->Ist.IMark.addr; + instrLen = st->Ist.IMark.len; + + (*instrs)++; + toNextInstr = False; + } + if (inPreamble) continue; + if (Ist_Exit == st->tag) { + jumpDst = IRConst2Addr(st->Ist.Exit.dst); + toNextInstr = (jumpDst == instrAddr + instrLen); + + (*cjmps)++; + } + } + + /* if the last instructions of BB conditionally jumps to next instruction + * (= first instruction of next BB in memory), this is a inverted by VEX. + */ + *cjmp_inverted = toNextInstr; +} + +static +void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) +{ + addStmtToIRSB( bbOut, + IRStmt_Store(CLGEndness, + IRExpr_Const(hWordTy == Ity_I32 ? + IRConst_U32( addr ) : + IRConst_U64( addr )), + IRExpr_Const(IRConst_U32(val)) )); +} + + +/* add helper call to setup_bbcc, with pointer to BB struct as argument + * + * precondition for setup_bbcc: + * - jmps_passed has number of cond.jumps passed in last executed BB + * - current_bbcc has a pointer to the BBCC of the last executed BB + * Thus, if bbcc_jmpkind is != -1 (JmpNone), + * current_bbcc->bb->jmp_addr + * gives the address of the jump source. 
+ * + * the setup does 2 things: + * - trace call: + * * Unwind own call stack, i.e sync our ESP with real ESP + * This is for ESP manipulation (longjmps, C++ exec handling) and RET + * * For CALLs or JMPs crossing objects, record call arg + + * push are on own call stack + * + * - prepare for cache log functions: + * set current_bbcc to BBCC that gets the costs for this BB execution + * attached + */ +static +void addBBSetupCall(ClgState* clgs) +{ + IRDirty* di; + IRExpr *arg1, **argv; + + arg1 = mkIRExpr_HWord( (HWord)clgs->bb ); + argv = mkIRExprVec_1(arg1); + di = unsafeIRDirty_0_N( 1, "setup_bbcc", + VG_(fnptr_to_fnentry)( & TG_(setup_bbcc) ), + argv); + addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); +} + + +static +IRSB* TG_(instrument)( VgCallbackClosure* closure, + IRSB* sbIn, + const VexGuestLayout* layout, + const VexGuestExtents* vge, + const VexArchInfo* archinfo_host, + IRType gWordTy, IRType hWordTy ) +{ + Int i; + IRStmt* st; + Addr origAddr; + InstrInfo* curr_inode = NULL; + ClgState clgs; + UInt cJumps = 0; + IRTypeEnv* tyenv = sbIn->tyenv; + + if (gWordTy != hWordTy) { + /* We don't currently support this case. */ + VG_(tool_panic)("host/guest word size mismatch"); + } + + // No instrumentation if it is switched off + if (! TG_(instrument_state)) { + TG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n", + (Addr)closure->readdr); + return sbIn; + } + + TG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr); + + /* Set up SB for instrumented IR */ + clgs.sbOut = deepCopyIRSBExceptStmts(sbIn); + + // Copy verbatim any IR preamble preceding the first IMark + i = 0; + while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) { + addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] ); + i++; + } + + // Get the first statement, and origAddr from it + TG_ASSERT(sbIn->stmts_used >0); + TG_ASSERT(i < sbIn->stmts_used); + st = sbIn->stmts[i]; + TG_ASSERT(Ist_IMark == st->tag); + + origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta; + TG_ASSERT(origAddr == st->Ist.IMark.addr + + st->Ist.IMark.delta); // XXX: check no overflow + + /* Get BB struct (creating if necessary). + * JS: The hash table is keyed with orig_addr_noredir -- important! + * JW: Why? If it is because of different chasing of the redirection, + * this is not needed, as chasing is switched off in tracegrind + */ + clgs.bb = TG_(get_bb)(origAddr, sbIn, &(clgs.seen_before)); + + addBBSetupCall(&clgs); + + // Set up running state + clgs.events_used = 0; + clgs.ii_index = 0; + clgs.instr_offset = 0; + + for (/*use current i*/; i < sbIn->stmts_used; i++) { + + st = sbIn->stmts[i]; + TG_ASSERT(isFlatIRStmt(st)); + + switch (st->tag) { + case Ist_NoOp: + case Ist_AbiHint: + case Ist_Put: + case Ist_PutI: + case Ist_MBE: + break; + + case Ist_IMark: { + Addr cia = st->Ist.IMark.addr + st->Ist.IMark.delta; + UInt isize = st->Ist.IMark.len; + TG_ASSERT(clgs.instr_offset == cia - origAddr); + // If Vex fails to decode an instruction, the size will be zero. + // Pretend otherwise. + if (isize == 0) isize = VG_MIN_INSTR_SZB; + + // Sanity-check size. + tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB) + || VG_CLREQ_SZB == isize ); + + // Init the inode, record it as the current one. + // Subsequent Dr/Dw/Dm events from the same instruction will + // also use it. 
+ curr_inode = next_InstrInfo (&clgs, isize); + + addEvent_Ir( &clgs, curr_inode ); + break; + } + + case Ist_WrTmp: { + IRExpr* data = st->Ist.WrTmp.data; + if (data->tag == Iex_Load) { + IRExpr* aexpr = data->Iex.Load.addr; + // Note also, endianness info is ignored. I guess + // that's not interesting. + addEvent_Dr( &clgs, curr_inode, + sizeofIRType(data->Iex.Load.ty), aexpr ); + } + break; + } + + case Ist_Store: { + IRExpr* data = st->Ist.Store.data; + IRExpr* aexpr = st->Ist.Store.addr; + addEvent_Dw( &clgs, curr_inode, + sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr ); + break; + } + + case Ist_StoreG: { + IRStoreG* sg = st->Ist.StoreG.details; + IRExpr* data = sg->data; + IRExpr* addr = sg->addr; + IRType type = typeOfIRExpr(tyenv, data); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded( &clgs, curr_inode, + sizeofIRType(type), addr, sg->guard, + True/*isWrite*/ ); + break; + } + + case Ist_LoadG: { + IRLoadG* lg = st->Ist.LoadG.details; + IRType type = Ity_INVALID; /* loaded type */ + IRType typeWide = Ity_INVALID; /* after implicit widening */ + IRExpr* addr = lg->addr; + typeOfIRLoadGOp(lg->cvt, &typeWide, &type); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded( &clgs, curr_inode, + sizeofIRType(type), addr, lg->guard, + False/*!isWrite*/ ); + break; + } + + case Ist_Dirty: { + Int dataSize; + IRDirty* d = st->Ist.Dirty.details; + if (d->mFx != Ifx_None) { + /* This dirty helper accesses memory. Collect the details. */ + tl_assert(d->mAddr != NULL); + tl_assert(d->mSize != 0); + dataSize = d->mSize; + // Large (eg. 28B, 108B, 512B on x86) data-sized + // instructions will be done inaccurately, but they're + // very rare and this avoids errors from hitting more + // than two cache lines in the simulation. + if (TG_(clo).simulate_cache && dataSize > TG_(min_line_size)) + dataSize = TG_(min_line_size); + if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) + addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr ); + if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) + addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr ); + } else { + tl_assert(d->mAddr == NULL); + tl_assert(d->mSize == 0); + } + break; + } + + case Ist_CAS: { + /* We treat it as a read and a write of the location. I + think that is the same behaviour as it was before IRCAS + was introduced, since prior to that point, the Vex + front ends would translate a lock-prefixed instruction + into a (normal) read followed by a (normal) write. */ + Int dataSize; + IRCAS* cas = st->Ist.CAS.details; + TG_ASSERT(cas->addr && isIRAtom(cas->addr)); + TG_ASSERT(cas->dataLo); + dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo)); + if (cas->dataHi != NULL) + dataSize *= 2; /* since this is a doubleword-cas */ + addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr ); + addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr ); + addEvent_G( &clgs, curr_inode ); + break; + } + + case Ist_LLSC: { + IRType dataTy; + if (st->Ist.LLSC.storedata == NULL) { + /* LL */ + dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result); + addEvent_Dr( &clgs, curr_inode, + sizeofIRType(dataTy), st->Ist.LLSC.addr ); + /* flush events before LL, should help SC to succeed */ + flushEvents( &clgs ); + } else { + /* SC */ + dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata); + addEvent_Dw( &clgs, curr_inode, + sizeofIRType(dataTy), st->Ist.LLSC.addr ); + /* I don't know whether the global-bus-lock cost should + be attributed to the LL or the SC, but it doesn't + really matter since they always have to be used in + pairs anyway. 
Hence put it (quite arbitrarily) on + the SC. */ + addEvent_G( &clgs, curr_inode ); + } + break; + } + + case Ist_Exit: { + Bool guest_exit, inverted; + + /* VEX code generation sometimes inverts conditional branches. + * As Tracegrind counts (conditional) jumps, it has to correct + * inversions. The heuristic is the following: + * (1) Tracegrind switches off SB chasing and unrolling, and + * therefore it assumes that a candidate for inversion only is + * the last conditional branch in an SB. + * (2) inversion is assumed if the branch jumps to the address of + * the next guest instruction in memory. + * This heuristic is precalculated in TG_(collectBlockInfo)(). + * + * Branching behavior is also used for branch prediction. Note that + * above heuristic is different from what Cachegrind does. + * Cachegrind uses (2) for all branches. + */ + if (cJumps+1 == clgs.bb->cjmp_count) + inverted = clgs.bb->cjmp_inverted; + else + inverted = False; + + // call branch predictor only if this is a branch in guest code + guest_exit = (st->Ist.Exit.jk == Ijk_Boring) || + (st->Ist.Exit.jk == Ijk_Call) || + (st->Ist.Exit.jk == Ijk_Ret); + + if (guest_exit) { + /* Stuff to widen the guard expression to a host word, so + we can pass it to the branch predictor simulation + functions easily. */ + IRType tyW = hWordTy; + IROp widen = tyW==Ity_I32 ? Iop_1Uto32 : Iop_1Uto64; + IROp opXOR = tyW==Ity_I32 ? Iop_Xor32 : Iop_Xor64; + IRTemp guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1); + IRTemp guardW = newIRTemp(clgs.sbOut->tyenv, tyW); + IRTemp guard = newIRTemp(clgs.sbOut->tyenv, tyW); + IRExpr* one = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1)) + : IRExpr_Const(IRConst_U64(1)); + + /* Widen the guard expression. */ + addStmtToIRSB( clgs.sbOut, + IRStmt_WrTmp( guard1, st->Ist.Exit.guard )); + addStmtToIRSB( clgs.sbOut, + IRStmt_WrTmp( guardW, + IRExpr_Unop(widen, + IRExpr_RdTmp(guard1))) ); + /* If the exit is inverted, invert the sense of the guard. */ + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp( + guard, + inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one) + : IRExpr_RdTmp(guardW) + )); + /* And post the event. */ + addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) ); + } + + /* We may never reach the next statement, so need to flush + all outstanding transactions now. */ + flushEvents( &clgs ); + + TG_ASSERT(clgs.ii_index>0); + if (!clgs.seen_before) { + TgJumpKind jk; + + if (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call; + else if (st->Ist.Exit.jk == Ijk_Ret) jk = jk_Return; + else { + if (IRConst2Addr(st->Ist.Exit.dst) == + origAddr + curr_inode->instr_offset + curr_inode->instr_size) + jk = jk_None; + else + jk = jk_Jump; + } + + clgs.bb->jmp[cJumps].instr = clgs.ii_index-1; + clgs.bb->jmp[cJumps].jmpkind = jk; + } + + /* Update global variable jmps_passed before the jump + * A correction is needed if VEX inverted the last jump condition + */ + UInt val = inverted ? cJumps+1 : cJumps; + addConstMemStoreStmt( clgs.sbOut, + (UWord) &TG_(current_state).jmps_passed, + val, hWordTy); + cJumps++; + + break; + } + + default: + tl_assert(0); + break; + } + + /* Copy the original statement */ + addStmtToIRSB( clgs.sbOut, st ); + + TG_DEBUGIF(5) { + VG_(printf)(" pass "); + ppIRStmt(st); + VG_(printf)("\n"); + } + } + + /* Deal with branches to unknown destinations. Except ignore ones + which are function returns as we assume the return stack + predictor never mispredicts. 
*/
+   if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
+      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
+      switch (sbIn->next->tag) {
+         case Iex_Const:
+            break; /* boring - branch to known address */
+         case Iex_RdTmp:
+            /* looks like an indirect branch (branch to unknown) */
+            addEvent_Bi( &clgs, curr_inode, sbIn->next );
+            break;
+         default:
+            /* shouldn't happen - if the incoming IR is properly
+               flattened, should only have tmp and const cases to
+               consider. */
+            tl_assert(0);
+      }
+   }
+
+   /* At the end of the bb. Flush outstandings. */
+   flushEvents( &clgs );
+
+   /* Update global variable jmps_passed at end of SB.
+    * As TG_(current_state).jmps_passed is reset to 0 in setup_bbcc,
+    * this can be omitted if there is no conditional jump in this SB.
+    * A correction is needed if VEX inverted the last jump condition
+    */
+   if (cJumps>0) {
+      UInt jmps_passed = cJumps;
+      if (clgs.bb->cjmp_inverted) jmps_passed--;
+      addConstMemStoreStmt( clgs.sbOut,
+                            (UWord) &TG_(current_state).jmps_passed,
+                            jmps_passed, hWordTy);
+   }
+   TG_ASSERT(clgs.bb->cjmp_count == cJumps);
+   TG_ASSERT(clgs.bb->instr_count == clgs.ii_index);
+
+   /* Info for final exit from BB */
+   {
+      TgJumpKind jk;
+
+      if (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
+      else if (sbIn->jumpkind == Ijk_Ret) jk = jk_Return;
+      else {
+         jk = jk_Jump;
+         if ((sbIn->next->tag == Iex_Const) &&
+             (IRConst2Addr(sbIn->next->Iex.Const.con) ==
+              origAddr + clgs.instr_offset))
+            jk = jk_None;
+      }
+      clgs.bb->jmp[cJumps].jmpkind = jk;
+      /* Instruction index of the call/ret at BB end
+       * (it is wrong for fall-through, but does not matter) */
+      clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
+   }
+
+   /* swap information of last exit with final exit if inverted */
+   if (clgs.bb->cjmp_inverted) {
+      TgJumpKind jk;
+      UInt instr;
+
+      jk = clgs.bb->jmp[cJumps].jmpkind;
+      clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
+      clgs.bb->jmp[cJumps-1].jmpkind = jk;
+      instr = clgs.bb->jmp[cJumps].instr;
+      clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
+      clgs.bb->jmp[cJumps-1].instr = instr;
+   }
+
+   if (clgs.seen_before) {
+      TG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
+      TG_ASSERT(clgs.bb->instr_len == clgs.instr_offset);
+   }
+   else {
+      clgs.bb->cost_count = update_cost_offsets(&clgs);
+      clgs.bb->instr_len = clgs.instr_offset;
+   }
+
+   TG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
+            origAddr, clgs.bb->instr_len,
+            clgs.bb->cjmp_count, clgs.bb->cost_count);
+   if (cJumps>0) {
+      TG_DEBUG(3, " [ ");
+      for (i=0;i<cJumps;i++)
+         TG_DEBUG(3, "%u ", clgs.bb->jmp[i].instr);
+      TG_DEBUG(3, "], last inverted: %s \n",
+               clgs.bb->cjmp_inverted ? "yes":"no");
+   }
+
+   return clgs.sbOut;
+}
+
+/*--------------------------------------------------------------------*/
+/*--- Discarding BB info ---*/
+/*--------------------------------------------------------------------*/
+
+// Called when a translation is removed from the translation cache for
+// any reason at all: to free up space, because the guest code was
+// unmapped or modified, or for any arbitrary reason.
+static
+void tg_discard_superblock_info ( Addr orig_addr, VexGuestExtents vge )
+{
+   tl_assert(vge.n_used > 0);
+
+   if (0)
+      VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
+                   (void*)orig_addr,
+                   (void*)vge.base[0], (ULong)vge.len[0]);
+
+   // Get BB info, remove from table, free BB info. Simple!
+   // When created, the BB is keyed by the first instruction address,
+   // (not orig_addr, but eventually redirected address). Thus, we
+   // use the first instruction address in vge.
+ TG_(delete_bb)(vge.base[0]); +} + + +/*------------------------------------------------------------*/ +/*--- TG_(fini)() and related function ---*/ +/*------------------------------------------------------------*/ + + + +static void zero_thread_cost(thread_info* t) +{ + Int i; + + for(i = 0; i < TG_(current_call_stack).sp; i++) { + if (!TG_(current_call_stack).entry[i].jcc) continue; + + /* reset call counters to current for active calls */ + TG_(copy_cost)( TG_(sets).full, + TG_(current_call_stack).entry[i].enter_cost, + TG_(current_state).cost ); + TG_(current_call_stack).entry[i].jcc->call_counter = 0; + } + + TG_(forall_bbccs)(TG_(zero_bbcc)); + + /* set counter for last dump */ + TG_(copy_cost)( TG_(sets).full, + t->lastdump_cost, TG_(current_state).cost ); +} + +void TG_(zero_all_cost)(Bool only_current_thread) +{ + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, " Zeroing costs...\n"); + + if (only_current_thread) + zero_thread_cost(TG_(get_current_thread)()); + else + TG_(forall_threads)(zero_thread_cost); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, " ...done\n"); +} + +static +void unwind_thread(thread_info* t) +{ + /* unwind signal handlers */ + while(TG_(current_state).sig !=0) + TG_(post_signal)(TG_(current_tid),TG_(current_state).sig); + + /* unwind regular call stack */ + while(TG_(current_call_stack).sp>0) + TG_(pop_call_stack)(); + + /* reset context and function stack for context generation */ + TG_(init_exec_state)( &TG_(current_state) ); + TG_(current_fn_stack).top = TG_(current_fn_stack).bottom; +} + +static +void zero_state_cost(thread_info* t) +{ + TG_(zero_cost)( TG_(sets).full, TG_(current_state).cost ); +} + +void TG_(set_instrument_state)(const HChar* reason, Bool state) +{ + if (TG_(instrument_state) == state) { + TG_DEBUG(2, "%s: instrumentation already %s\n", + reason, state ? "ON" : "OFF"); + return; + } + TG_(instrument_state) = state; + TG_DEBUG(2, "%s: Switching instrumentation %s ...\n", + reason, state ? "ON" : "OFF"); + + VG_(discard_translations_safely)( (Addr)0x1000, ~(SizeT)0xfff, "tracegrind"); + + /* reset internal state: call stacks, simulator */ + TG_(forall_threads)(unwind_thread); + TG_(forall_threads)(zero_state_cost); + (*TG_(cachesim).clear)(); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n", + reason, state ? "ON" : "OFF"); +} + +/* helper for dump_state_togdb */ +static void dump_state_of_thread_togdb(thread_info* ti) +{ + static FullCost sum = 0, tmp = 0; + Int t, i; + BBCC *from, *to; + call_entry* ce; + HChar *mcost; + + t = TG_(current_tid); + TG_(init_cost_lz)( TG_(sets).full, &sum ); + TG_(copy_cost_lz)( TG_(sets).full, &tmp, ti->lastdump_cost ); + TG_(add_diff_cost)( TG_(sets).full, sum, ti->lastdump_cost, + ti->states.entry[0]->cost); + TG_(copy_cost)( TG_(sets).full, ti->lastdump_cost, tmp ); + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d: %s\n", t, mcost); + VG_(free)(mcost); + VG_(gdb_printf)("frames-%d: %d\n", t, TG_(current_call_stack).sp); + + ce = 0; + for(i = 0; i < TG_(current_call_stack).sp; i++) { + ce = TG_(get_call_entry)(i); + /* if this frame is skipped, we don't have counters */ + if (!ce->jcc) continue; + + from = ce->jcc->from; + VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name); + VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter); + + /* FIXME: EventSets! 
*/ + TG_(copy_cost)( TG_(sets).full, sum, ce->jcc->cost ); + TG_(copy_cost)( TG_(sets).full, tmp, ce->enter_cost ); + TG_(add_diff_cost)( TG_(sets).full, sum, + ce->enter_cost, TG_(current_state).cost ); + TG_(copy_cost)( TG_(sets).full, ce->enter_cost, tmp ); + + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d-%d: %s\n",t, i, mcost); + VG_(free)(mcost); + } + if (ce && ce->jcc) { + to = ce->jcc->to; + VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name ); + } +} + +/* Dump current state */ +static void dump_state_togdb(void) +{ + thread_info** th; + int t; + Int orig_tid = TG_(current_tid); + + VG_(gdb_printf)("instrumentation: %s\n", + TG_(instrument_state) ? "on":"off"); + if (!TG_(instrument_state)) return; + + VG_(gdb_printf)("executed-bbs: %llu\n", TG_(stat).bb_executions); + VG_(gdb_printf)("executed-calls: %llu\n", TG_(stat).call_counter); + VG_(gdb_printf)("distinct-bbs: %d\n", TG_(stat).distinct_bbs); + VG_(gdb_printf)("distinct-calls: %d\n", TG_(stat).distinct_jccs); + VG_(gdb_printf)("distinct-functions: %d\n", TG_(stat).distinct_fns); + VG_(gdb_printf)("distinct-contexts: %d\n", TG_(stat).distinct_contexts); + + /* "events:" line. Given here because it will be dynamic in the future */ + HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(gdb_printf)("events: %s\n", evmap); + VG_(free)(evmap); + /* "part:" line (number of last part. Is 0 at start */ + VG_(gdb_printf)("part: %d\n", TG_(get_dump_counter)()); + + /* threads */ + th = TG_(get_threads)(); + VG_(gdb_printf)("threads:"); + for(t=1;t]\n"); + VG_(gdb_printf) (" dump counters\n"); + VG_(gdb_printf) (" zero\n"); + VG_(gdb_printf) (" zero counters\n"); + VG_(gdb_printf) (" status\n"); + VG_(gdb_printf) (" print status\n"); + VG_(gdb_printf) (" instrumentation [on|off]\n"); + VG_(gdb_printf) (" get/set (if on/off given) instrumentation state\n"); + VG_(gdb_printf) ("\n"); +} + +/* return True if request recognised, False otherwise */ +static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req) +{ + HChar* wcmd; + HChar s[VG_(strlen)(req) + 1]; /* copy for strtok_r */ + HChar *ssaveptr; + + VG_(strcpy) (s, req); + + wcmd = VG_(strtok_r) (s, " ", &ssaveptr); + switch (VG_(keyword_id) ("help dump zero status instrumentation", + wcmd, kwd_report_duplicated_matches)) { + case -2: /* multiple matches */ + return True; + case -1: /* not found */ + return False; + case 0: /* help */ + print_monitor_help(); + return True; + case 1: { /* dump */ + TG_(dump_profile)(req, False); + return True; + } + case 2: { /* zero */ + TG_(zero_all_cost)(False); + return True; + } + + case 3: { /* status */ + HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr); + if (arg && (VG_(strcmp)(arg, "internal") == 0)) { + /* internal interface to tracegrind_control */ + dump_state_togdb(); + return True; + } + + if (!TG_(instrument_state)) { + VG_(gdb_printf)("No status available as instrumentation is switched off\n"); + } else { + // Status information to be improved ... 
+ thread_info** th = TG_(get_threads)(); + Int t, tcount = 0; + for(t=1;ttv_sec = ms_timer / 1000; + systime->tv_nsec = (ms_timer % 1000) * 1000000L; + break; + } + case systime_usec: { + struct vki_timeval tv_now; + VG_(gettimeofday)(&tv_now, NULL); + systime->tv_sec = tv_now.tv_sec; + systime->tv_nsec = tv_now.tv_usec * 1000; + break; + } + case systime_nsec: +# if defined(VGO_linux) || defined(VGO_solaris) || defined(VGO_freebsd) + VG_(clock_gettime)(systime, VKI_CLOCK_MONOTONIC); + VG_(clock_gettime)(syscputime, VKI_CLOCK_THREAD_CPUTIME_ID); + +# elif defined(VGO_darwin) + tl_assert(0); +# else +# error "Unknown OS" +# endif + break; + } +} + +static +void TG_(pre_syscalltime)(ThreadId tid, UInt syscallno, + UWord* args, UInt nArgs) +{ + collect_time(&syscalltime[tid], + TG_(clo).collect_systime == systime_nsec ? &syscallcputime[tid] : NULL); +} + +/* Returns "after - before" in the unit as specified by --collect-systime. + after is supposed to be >= before, and tv_nsec must be >= 0 and < One_Second_In_Nsec. */ +static +ULong vki_timespec_diff (struct vki_timespec after, struct vki_timespec before) +{ + vki_time_t diff_sec = after.tv_sec - before.tv_sec; + long diff_nsec = after.tv_nsec - before.tv_nsec; + ULong nsec_factor; // factor to convert the desired unit into nsec. + + if (diff_nsec < 0) { + diff_sec--; + diff_nsec += 1000000000ULL; + } + switch (TG_(clo).collect_systime) { + case systime_no: tl_assert (0); + case systime_msec: nsec_factor = 1000000ULL; break; + case systime_usec: nsec_factor = 1000ULL; break; + case systime_nsec: nsec_factor = 1ULL; break; + default: tl_assert(0); + } + return ((ULong) diff_sec * 1000000000ULL + diff_nsec) / nsec_factor; +} + +static +void TG_(post_syscalltime)(ThreadId tid, UInt syscallno, + UWord* args, UInt nArgs, SysRes res) +{ + if (TG_(current_state).bbcc) { + Int o; + struct vki_timespec ts_now; + struct vki_timespec ts_cpunow; + ULong diff; + + collect_time(&ts_now, + TG_(clo).collect_systime == systime_nsec ? &ts_cpunow : NULL); + + diff = vki_timespec_diff (ts_now, syscalltime[tid]); + + /* offset o is for "SysCount", o+1 for "SysTime", + o+2 is (optionally) "SysCpuTime". 
*/ + o = fullOffset(EG_SYS); + TG_ASSERT(o>=0); + TG_DEBUG(0," Time (Off %d) for Syscall %u: %llu\n", o, syscallno, + diff); + + if (!TG_(current_state).bbcc->skipped) + TG_(init_cost_lz)(TG_(sets).full, + &(TG_(current_state).bbcc->skipped)); + TG_(current_state).cost[o] ++; + TG_(current_state).cost[o+1] += diff; + TG_(current_state).bbcc->skipped[o] ++; + TG_(current_state).bbcc->skipped[o+1] += diff; + if (TG_(clo).collect_systime == systime_nsec) { + diff = vki_timespec_diff (ts_cpunow, syscallcputime[tid]); + TG_DEBUG(0," SysCpuTime (Off %d) for Syscall %u: %llu\n", o+2, syscallno, + diff); + TG_(current_state).cost[o+2] += diff; + TG_(current_state).bbcc->skipped[o+2] += diff; + } + } +} + +static UInt ULong_width(ULong n) +{ + UInt w = 0; + while (n > 0) { + n = n / 10; + w++; + } + if (w == 0) w = 1; + return w + (w-1)/3; // add space for commas +} + +static +void branchsim_printstat(int l1, int l2, int l3) +{ + static HChar fmt[128]; // large enough + FullCost total; + ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp; + ULong B_total_b, B_total_mp; + + total = TG_(total_cost); + Bc_total_b = total[ fullOffset(EG_BC) ]; + Bc_total_mp = total[ fullOffset(EG_BC)+1 ]; + Bi_total_b = total[ fullOffset(EG_BI) ]; + Bi_total_mp = total[ fullOffset(EG_BI)+1 ]; + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n", + l1, l2, l3); + + if (0 == Bc_total_b) Bc_total_b = 1; + if (0 == Bi_total_b) Bi_total_b = 1; + B_total_b = Bc_total_b + Bi_total_b; + B_total_mp = Bc_total_mp + Bi_total_mp; + + VG_(umsg)("\n"); + VG_(umsg)(fmt, "Branches: ", + B_total_b, Bc_total_b, Bi_total_b); + + VG_(umsg)(fmt, "Mispredicts: ", + B_total_mp, Bc_total_mp, Bi_total_mp); + + VG_(umsg)("Mispred rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", + l1, B_total_mp * 100.0 / B_total_b, + l2, Bc_total_mp * 100.0 / Bc_total_b, + l3, Bi_total_mp * 100.0 / Bi_total_b); +} + +static +void tg_print_stats(void) +{ + int BB_lookups = + TG_(stat).full_debug_BBs + + TG_(stat).fn_name_debug_BBs + + TG_(stat).file_line_debug_BBs + + TG_(stat).no_debug_BBs; + + /* Hash table stats */ + VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n", + TG_(stat).distinct_objs); + VG_(message)(Vg_DebugMsg, "Distinct files: %d\n", + TG_(stat).distinct_files); + VG_(message)(Vg_DebugMsg, "Distinct fns: %d\n", + TG_(stat).distinct_fns); + VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n", + TG_(stat).distinct_contexts); + VG_(message)(Vg_DebugMsg, "Distinct BBs: %d\n", + TG_(stat).distinct_bbs); + VG_(message)(Vg_DebugMsg, "Cost entries: %u (Chunks %u)\n", + TG_(costarray_entries), TG_(costarray_chunks)); + VG_(message)(Vg_DebugMsg, "Distinct BBCCs: %d\n", + TG_(stat).distinct_bbccs); + VG_(message)(Vg_DebugMsg, "Distinct JCCs: %d\n", + TG_(stat).distinct_jccs); + VG_(message)(Vg_DebugMsg, "Distinct skips: %d\n", + TG_(stat).distinct_skips); + VG_(message)(Vg_DebugMsg, "BB lookups: %d\n", + BB_lookups); + if (BB_lookups>0) { + VG_(message)(Vg_DebugMsg, "With full debug info:%3d%% (%d)\n", + TG_(stat).full_debug_BBs * 100 / BB_lookups, + TG_(stat).full_debug_BBs); + VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n", + TG_(stat).file_line_debug_BBs * 100 / BB_lookups, + TG_(stat).file_line_debug_BBs); + VG_(message)(Vg_DebugMsg, "With fn name debug info:%3d%% (%d)\n", + TG_(stat).fn_name_debug_BBs * 100 / BB_lookups, + TG_(stat).fn_name_debug_BBs); + VG_(message)(Vg_DebugMsg, "With no debug info:%3d%% (%d)\n", + TG_(stat).no_debug_BBs * 100 / BB_lookups, + 
TG_(stat).no_debug_BBs); + } + VG_(message)(Vg_DebugMsg, "BBCC Clones: %d\n", + TG_(stat).bbcc_clones); + VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d\n", + TG_(stat).bb_retranslations); + VG_(message)(Vg_DebugMsg, "Distinct instrs: %d\n", + TG_(stat).distinct_instrs); + + VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n", + TG_(stat).cxt_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU BBCC Misses: %d\n", + TG_(stat).bbcc_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU JCC Misses: %d\n", + TG_(stat).jcc_lru_misses); + VG_(message)(Vg_DebugMsg, "BBs Executed: %llu\n", + TG_(stat).bb_executions); + VG_(message)(Vg_DebugMsg, "Calls: %llu\n", + TG_(stat).call_counter); + VG_(message)(Vg_DebugMsg, "CondJMP followed: %llu\n", + TG_(stat).jcnd_counter); + VG_(message)(Vg_DebugMsg, "Boring JMPs: %llu\n", + TG_(stat).jump_counter); + VG_(message)(Vg_DebugMsg, "Recursive calls: %llu\n", + TG_(stat).rec_call_counter); + VG_(message)(Vg_DebugMsg, "Returns: %llu\n", + TG_(stat).ret_counter); +} + + +static +void finish(void) +{ + HChar fmt[128]; // large enough + Int l1, l2, l3; + FullCost total; + + TG_DEBUG(0, "finish()\n"); + + (*TG_(cachesim).finish)(); + + /* pop all remaining items from CallStack for correct sum + */ + TG_(forall_threads)(unwind_thread); + + TG_(dump_profile)(0, False); + + if (VG_(clo_verbosity) == 0) return; + + if (VG_(clo_stats)) { + VG_(message)(Vg_DebugMsg, "\n"); + tg_print_stats(); + VG_(message)(Vg_DebugMsg, "\n"); + } + + HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(message)(Vg_UserMsg, "Events : %s\n", evmap); + VG_(free)(evmap); + HChar *mcost = TG_(mappingcost_as_string)(TG_(dumpmap), TG_(total_cost)); + VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost); + VG_(free)(mcost); + VG_(message)(Vg_UserMsg, "\n"); + + /* determine value widths for statistics */ + total = TG_(total_cost); + l1 = ULong_width( total[fullOffset(EG_IR)] ); + l2 = l3 = 0; + if (TG_(clo).simulate_cache) { + l2 = ULong_width( total[fullOffset(EG_DR)] ); + l3 = ULong_width( total[fullOffset(EG_DW)] ); + } + if (TG_(clo).simulate_branch) { + int l2b = ULong_width( total[fullOffset(EG_BC)] ); + int l3b = ULong_width( total[fullOffset(EG_BI)] ); + if (l2b > l2) l2 = l2b; + if (l3b > l3) l3 = l3b; + } + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1); + + /* Always print this */ + VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)] ); + + if (TG_(clo).simulate_cache) + (*TG_(cachesim).printstat)(l1, l2, l3); + + if (TG_(clo).simulate_branch) + branchsim_printstat(l1, l2, l3); + +} + + +void TG_(fini)(Int exitcode) +{ + finish(); +} + + +/*--------------------------------------------------------------------*/ +/*--- Setup ---*/ +/*--------------------------------------------------------------------*/ + +static void tg_start_client_code_callback ( ThreadId tid, ULong blocks_done ) +{ + static ULong last_blocks_done = 0; + + if (0) + VG_(printf)("%d R %llu\n", (Int)tid, blocks_done); + + /* throttle calls to TG_(run_thread) by number of BBs executed */ + if (blocks_done - last_blocks_done < 5000) return; + last_blocks_done = blocks_done; + + TG_(run_thread)( tid ); +} + +static +void TG_(post_clo_init)(void) +{ + if (VG_(clo_vex_control).iropt_register_updates_default + != VexRegUpdSpAtMemAccess) { + TG_DEBUG(1, " Using user specified value for " + "--vex-iropt-register-updates\n"); + } else { + TG_DEBUG(1, + " Using default --vex-iropt-register-updates=" + "sp-at-mem-access\n"); + } + + if (TG_(clo).collect_systime != systime_no) 
{ + VG_(needs_syscall_wrapper)(TG_(pre_syscalltime), + TG_(post_syscalltime)); + syscalltime = TG_MALLOC("cl.main.pci.1", + VG_N_THREADS * sizeof syscalltime[0]); + for (UInt i = 0; i < VG_N_THREADS; ++i) { + syscalltime[i].tv_sec = 0; + syscalltime[i].tv_nsec = 0; + } + if (TG_(clo).collect_systime == systime_nsec) { + syscallcputime = TG_MALLOC("cl.main.pci.2", + VG_N_THREADS * sizeof syscallcputime[0]); + for (UInt i = 0; i < VG_N_THREADS; ++i) { + syscallcputime[i].tv_sec = 0; + syscallcputime[i].tv_nsec = 0; + } + } + } + + if (VG_(clo_px_file_backed) != VexRegUpdSpAtMemAccess) { + TG_DEBUG(1, " Using user specified value for " + "--px-file-backed\n"); + } else { + TG_DEBUG(1, + " Using default --px-file-backed=" + "sp-at-mem-access\n"); + } + + if (VG_(clo_vex_control).iropt_unroll_thresh != 0) { + VG_(message)(Vg_UserMsg, + "tracegrind only works with --vex-iropt-unroll-thresh=0\n" + "=> resetting it back to 0\n"); + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + } + if (VG_(clo_vex_control).guest_chase) { + VG_(message)(Vg_UserMsg, + "tracegrind only works with --vex-guest-chase=no\n" + "=> resetting it back to 'no'\n"); + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. + } + + TG_DEBUG(1, " dump threads: %s\n", TG_(clo).separate_threads ? "Yes":"No"); + TG_DEBUG(1, " call sep. : %d\n", TG_(clo).separate_callers); + TG_DEBUG(1, " rec. sep. : %d\n", TG_(clo).separate_recursions); + + if (!TG_(clo).dump_line && !TG_(clo).dump_instr && !TG_(clo).dump_bb) { + VG_(message)(Vg_UserMsg, "Using source line as position.\n"); + TG_(clo).dump_line = True; + } + + TG_(init_dumps)(); + + (*TG_(cachesim).post_clo_init)(); + + TG_(init_eventsets)(); + TG_(init_statistics)(& TG_(stat)); + TG_(init_cost_lz)( TG_(sets).full, &TG_(total_cost) ); + + /* initialize hash tables */ + TG_(init_obj_table)(); + TG_(init_cxt_table)(); + TG_(init_bb_hash)(); + + TG_(init_threads)(); + TG_(run_thread)(1); + + TG_(instrument_state) = TG_(clo).instrument_atstart; + + if (VG_(clo_verbosity) > 0) { + VG_(message)(Vg_UserMsg, + "For interactive control, run 'tracegrind_control%s%s -h'.\n", + (VG_(arg_vgdb_prefix) ? " " : ""), + (VG_(arg_vgdb_prefix) ? VG_(arg_vgdb_prefix) : "")); + } +} + +static +void TG_(pre_clo_init)(void) +{ + VG_(details_name) ("Tracegrind"); + VG_(details_version) (NULL); + VG_(details_description) ("a streaming CSV trace cache profiler"); + VG_(details_copyright_author)("Copyright (C) 2026, and GNU GPL'd, " + "by CodSpeed Technology SAS. " + "Based on Callgrind by Josef Weidendorfer et al."); + VG_(details_bug_reports_to) (VG_BUGS_TO); + VG_(details_avg_translation_sizeB) ( 500 ); + + VG_(clo_vex_control).iropt_register_updates_default + = VG_(clo_px_file_backed) + = VexRegUpdSpAtMemAccess; // overridable by the user. + + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. 
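+
+   /* Note: tracegrind's BB instrumentation assumes that VEX neither
+      unrolls superblocks nor chases across branch targets; that is why
+      the two settings above are forced here and re-checked (with a user
+      message) in TG_(post_clo_init). */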
+ + VG_(basic_tool_funcs) (TG_(post_clo_init), + TG_(instrument), + TG_(fini)); + + VG_(needs_superblock_discards)(tg_discard_superblock_info); + + + VG_(needs_command_line_options)(TG_(process_cmd_line_option), + TG_(print_usage), + TG_(print_debug_usage)); + + VG_(needs_client_requests)(TG_(handle_client_request)); + VG_(needs_print_stats) (tg_print_stats); + + VG_(track_start_client_code) ( & tg_start_client_code_callback ); + VG_(track_pre_deliver_signal) ( & TG_(pre_signal) ); + VG_(track_post_deliver_signal)( & TG_(post_signal) ); + + TG_(set_clo_defaults)(); + +} + +VG_DETERMINE_INTERFACE_VERSION(TG_(pre_clo_init)) + +/*--------------------------------------------------------------------*/ +/*--- end main.c ---*/ +/*--------------------------------------------------------------------*/ diff --git a/tracegrind/sim.c b/tracegrind/sim.c new file mode 100644 index 000000000..68e6fa84a --- /dev/null +++ b/tracegrind/sim.c @@ -0,0 +1,1739 @@ +/*--------------------------------------------------------------------*/ +/*--- Cache simulation. ---*/ +/*--- sim.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + + +/* Notes: + - simulates a write-allocate cache + - (block --> set) hash function uses simple bit selection + - handling of references straddling two cache blocks: + - counts as only one cache access (not two) + - both blocks hit --> one hit + - one block hits, the other misses --> one miss + - both blocks miss --> one miss (not two) +*/ + +/* Cache configuration */ +#include "cg_arch.c" + +/* additional structures for cache use info, separated + * according usage frequency: + * - line_loaded : pointer to cost center of instruction + * which loaded the line into cache. + * Needed to increment counters when line is evicted. + * - line_use : updated on every access + */ +typedef struct { + UInt count; + UInt mask; /* e.g. 
for 64Byte line size 1bit/2Byte */
+} line_use;
+
+typedef struct {
+   Addr memline, iaddr;
+   line_use* dep_use; /* point to higher-level cacheblock for this memline */
+   ULong* use_base;
+} line_loaded;
+
+/* Cache state */
+typedef struct {
+   const HChar* name;
+   int size; /* bytes */
+   int assoc;
+   int line_size; /* bytes */
+   Bool sectored; /* prefetch nearside cacheline on read */
+   int sets;
+   int sets_min_1;
+   int line_size_bits;
+   int tag_shift;
+   UWord tag_mask;
+   HChar desc_line[128]; // large enough
+   UWord* tags;
+
+   /* for cache use */
+   int line_size_mask;
+   int* line_start_mask;
+   int* line_end_mask;
+   line_loaded* loaded;
+   line_use* use;
+} cache_t2;
+
+/*
+ * States of flat caches in our model.
+ * We use a 2-level hierarchy,
+ */
+static cache_t2 I1, D1, LL;
+
+/* Lower bits of cache tags are used as flags for a cache line */
+#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
+#define CACHELINE_DIRTY 1
+
+
+/* Cache simulator Options */
+static Bool clo_simulate_writeback = False;
+static Bool clo_simulate_hwpref = False;
+static Bool clo_simulate_sectors = False;
+static Bool clo_collect_cacheuse = False;
+
+/* Following global vars are setup before by setup_bbcc():
+ *
+ * - Addr TG_(bb_base) (instruction start address of original BB)
+ * - ULong* TG_(cost_base) (start of cost array for BB)
+ */
+
+Addr TG_(bb_base);
+ULong* TG_(cost_base);
+
+static InstrInfo* current_ii;
+
+/* Cache use offsets */
+/* The offsets are only correct because all per-instruction event sets get
+ * the "Use" set added first !
+ */
+static Int off_I1_AcCost = 0;
+static Int off_I1_SpLoss = 1;
+static Int off_D1_AcCost = 0;
+static Int off_D1_SpLoss = 1;
+static Int off_LL_AcCost = 2;
+static Int off_LL_SpLoss = 3;
+
+/* Cache access types */
+typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
+
+/* Result of a reference into a flat cache */
+typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
+
+/* Result of a reference into a hierarchical cache model */
+typedef enum {
+   L1_Hit,
+   LL_Hit,
+   MemAccess,
+   WriteBackMemAccess } CacheModelResult;
+
+typedef CacheModelResult (*simcall_type)(Addr, UChar);
+
+static struct {
+   simcall_type I1_Read;
+   simcall_type D1_Read;
+   simcall_type D1_Write;
+} simulator;
+
+/*------------------------------------------------------------*/
+/*--- Cache Simulator Initialization ---*/
+/*------------------------------------------------------------*/
+
+static void cachesim_clearcache(cache_t2* c)
+{
+   Int i;
+
+   for (i = 0; i < c->sets * c->assoc; i++)
+      c->tags[i] = 0;
+   if (c->use) {
+      for (i = 0; i < c->sets * c->assoc; i++) {
+         c->loaded[i].memline = 0;
+         c->loaded[i].use_base = 0;
+         c->loaded[i].dep_use = 0;
+         c->loaded[i].iaddr = 0;
+         c->use[i].mask = 0;
+         c->use[i].count = 0;
+         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
+      }
+   }
+}
+
+static void cacheuse_initcache(cache_t2* c);
+
+/* By this point, the size/assoc/line_size has been checked. */
+static void cachesim_initcache(cache_t config, cache_t2* c)
+{
+   c->size = config.size;
+   c->assoc = config.assoc;
+   c->line_size = config.line_size;
+   c->sectored = False; // FIXME
+
+   c->sets = (c->size / c->line_size) / c->assoc;
+   c->sets_min_1 = c->sets - 1;
+   c->line_size_bits = VG_(log2)(c->line_size);
+   c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
+   c->tag_mask = ~((1u<<c->tag_shift)-1);
+
+   /* Can bits in tag entries be used for flags?
+ * Should be always true as MIN_LINE_SIZE >= 16 */ + TG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0); + + if (c->assoc == 1) { + VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s", + c->size, c->line_size, + c->sectored ? ", sectored":""); + } else { + VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s", + c->size, c->line_size, c->assoc, + c->sectored ? ", sectored":""); + } + + c->tags = (UWord*) TG_MALLOC("cl.sim.cs_ic.1", + sizeof(UWord) * c->sets * c->assoc); + if (clo_collect_cacheuse) + cacheuse_initcache(c); + else + c->use = 0; + cachesim_clearcache(c); +} + + +#if 0 +static void print_cache(cache_t2* c) +{ + UInt set, way, i; + + /* Note initialisation and update of 'i'. */ + for (i = 0, set = 0; set < c->sets; set++) { + for (way = 0; way < c->assoc; way++, i++) { + VG_(printf)("%8x ", c->tags[i]); + } + VG_(printf)("\n"); + } +} +#endif + + +/*------------------------------------------------------------*/ +/*--- Simple Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * Model: single inclusive, 2-level cache hierarchy (L1/LL) + * with write-allocate + * + * For simple cache hit/miss counts, we do not have to + * maintain the dirty state of lines (no need to distinguish + * read/write references), and the resulting counts are the + * same for write-through and write-back caches. + * + * Simulator functions: + * CacheModelResult cachesim_I1_ref(Addr a, UChar size) + * CacheModelResult cachesim_D1_ref(Addr a, UChar size) + */ +__attribute__((always_inline)) +static __inline__ +CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag) +{ + int i, j; + UWord *set; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == set[0]) + return Hit; + + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == set[i]) { + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + + return Miss; +} + +__attribute__((always_inline)) +static __inline__ +CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size) +{ + UWord block1 = a >> c->line_size_bits; + UWord block2 = (a+size-1) >> c->line_size_bits; + UInt set1 = block1 & c->sets_min_1; + /* the tag does not need to include bits specifying the set, + * but it can, and this saves instructions */ + UWord tag1 = block1; + + /* Access entirely within line. */ + if (block1 == block2) + return cachesim_setref(c, set1, tag1); + + /* Access straddles two lines. */ + else if (block1 + 1 == block2) { + UInt set2 = block2 & c->sets_min_1; + UWord tag2 = block2; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref(c, set1, tag1); + CacheResult res2 = cachesim_setref(c, set2, tag2); + return ((res1 == Miss) || (res2 == Miss)) ? 
Miss : Hit; + + } else { + VG_(printf)("addr: %lx size: %u blocks: %lu %lu", + a, size, block1, block2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} + +static +CacheModelResult cachesim_I1_ref(Addr a, UChar size) +{ + if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit; + if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit; + return MemAccess; +} + +static +CacheModelResult cachesim_D1_ref(Addr a, UChar size) +{ + if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit; + if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit; + return MemAccess; +} + + +/*------------------------------------------------------------*/ +/*--- Write Back Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * More complex model: L1 Write-through, LL Write-back + * This needs to distinguish among read and write references. + * + * Simulator functions: + * CacheModelResult cachesim_I1_Read(Addr a, UChar size) + * CacheModelResult cachesim_D1_Read(Addr a, UChar size) + * CacheModelResult cachesim_D1_Write(Addr a, UChar size) + */ + +/* + * With write-back, result can be a miss evicting a dirty line + * The dirty state of a cache line is stored in Bit0 of the tag for + * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference + * type (Read/Write), the line gets dirty on a write. + */ +__attribute__((always_inline)) +static __inline__ +CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) +{ + int i, j; + UWord *set, tmp_tag; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == (set[0] & ~CACHELINE_DIRTY)) { + set[0] |= ref; + return Hit; + } + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == (set[i] & ~CACHELINE_DIRTY)) { + tmp_tag = set[i] | ref; // update dirty flag + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[c->assoc - 1]; + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | ref; + + return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss; +} + +__attribute__((always_inline)) +static __inline__ +CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) +{ + UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1); + UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1); + UWord tag = a & c->tag_mask; + + /* Access entirely within line. */ + if (set1 == set2) + return cachesim_setref_wb(c, ref, set1, tag); + + /* Access straddles two lines. */ + /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ + else if (((set1 + 1) & (c->sets_min_1)) == set2) { + UWord tag2 = (a+size-1) & c->tag_mask; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); + CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2); + + if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty; + return ((res1 == Miss) || (res2 == Miss)) ? 
Miss : Hit;
+
+   } else {
+      VG_(printf)("addr: %lx size: %u sets: %u %u", a, size, set1, set2);
+      VG_(tool_panic)("item straddles more than two cache sets");
+   }
+   return Hit;
+}
+
+
+static
+CacheModelResult cachesim_I1_Read(Addr a, UChar size)
+{
+   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
+   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+      case Hit: return LL_Hit;
+      case Miss: return MemAccess;
+      default: break;
+   }
+   return WriteBackMemAccess;
+}
+
+static
+CacheModelResult cachesim_D1_Read(Addr a, UChar size)
+{
+   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
+   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+      case Hit: return LL_Hit;
+      case Miss: return MemAccess;
+      default: break;
+   }
+   return WriteBackMemAccess;
+}
+
+static
+CacheModelResult cachesim_D1_Write(Addr a, UChar size)
+{
+   if ( cachesim_ref( &D1, a, size) == Hit ) {
+      /* Even for a L1 hit, the write-trough L1 passes
+       * the write to the LL to make the LL line dirty.
+       * But this causes no latency, so return the hit.
+       */
+      cachesim_ref_wb( &LL, Write, a, size);
+      return L1_Hit;
+   }
+   switch( cachesim_ref_wb( &LL, Write, a, size) ) {
+      case Hit: return LL_Hit;
+      case Miss: return MemAccess;
+      default: break;
+   }
+   return WriteBackMemAccess;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Hardware Prefetch Simulation ---*/
+/*------------------------------------------------------------*/
+
+static ULong prefetch_up = 0;
+static ULong prefetch_down = 0;
+
+#define PF_STREAMS 8
+#define PF_PAGEBITS 12
+
+static UInt pf_lastblock[PF_STREAMS];
+static Int pf_seqblocks[PF_STREAMS];
+
+static
+void prefetch_clear(void)
+{
+   int i;
+   for(i=0;i<PF_STREAMS;i++)
+      pf_lastblock[i] = pf_seqblocks[i] = 0;
+}
+
+static __inline__
+void prefetch_LL_doref(Addr a)
+{
+   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
+   UInt block = ( a >> LL.line_size_bits);
+
+   if (block != pf_lastblock[stream]) {
+      if (pf_seqblocks[stream] == 0) {
+         if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
+         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
+      }
+      else if (pf_seqblocks[stream] >0) {
+         if (pf_lastblock[stream] +1 == block) {
+            pf_seqblocks[stream]++;
+            if (pf_seqblocks[stream] >= 2) {
+               prefetch_up++;
+               cachesim_ref(&LL, a + 5 * LL.line_size,1);
+            }
+         }
+         else pf_seqblocks[stream] = 0;
+      }
+      else if (pf_seqblocks[stream] <0) {
+         if (pf_lastblock[stream] -1 == block) {
+            pf_seqblocks[stream]--;
+            if (pf_seqblocks[stream] <= -2) {
+               prefetch_down++;
+               cachesim_ref(&LL, a - 5 * LL.line_size,1);
+            }
+         }
+         else pf_seqblocks[stream] = 0;
+      }
+      pf_lastblock[stream] = block;
+   }
+}
+
+/* simple model with hardware prefetch */
+
+static
+CacheModelResult prefetch_I1_ref(Addr a, UChar size)
+{
+   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
+   prefetch_LL_doref(a);
+   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
+   return MemAccess;
+}
+
+static
+CacheModelResult prefetch_D1_ref(Addr a, UChar size)
+{
+   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
+   prefetch_LL_doref(a);
+   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
+   return MemAccess;
+}
+
+
+/* complex model with hardware prefetch */
+
+static
+CacheModelResult prefetch_I1_Read(Addr a, UChar size)
+{
+   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
+   prefetch_LL_doref(a);
+   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+      case Hit: return LL_Hit;
+      case Miss: return MemAccess;
+      default: break;
+   }
+   return WriteBackMemAccess;
+}
+
+static
+CacheModelResult prefetch_D1_Read(Addr a, UChar size)
+{
+   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
+   prefetch_LL_doref(a);
+   switch(
cachesim_ref_wb( &LL, Read, a, size) ) {
+      case Hit: return LL_Hit;
+      case Miss: return MemAccess;
+      default: break;
+   }
+   return WriteBackMemAccess;
+}
+
+static
+CacheModelResult prefetch_D1_Write(Addr a, UChar size)
+{
+   prefetch_LL_doref(a);
+   if ( cachesim_ref( &D1, a, size) == Hit ) {
+      /* Even for a L1 hit, the write-trough L1 passes
+       * the write to the LL to make the LL line dirty.
+       * But this causes no latency, so return the hit.
+       */
+      cachesim_ref_wb( &LL, Write, a, size);
+      return L1_Hit;
+   }
+   switch( cachesim_ref_wb( &LL, Write, a, size) ) {
+      case Hit: return LL_Hit;
+      case Miss: return MemAccess;
+      default: break;
+   }
+   return WriteBackMemAccess;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Cache Simulation with use metric collection ---*/
+/*------------------------------------------------------------*/
+
+/* can not be combined with write-back or prefetch */
+
+static
+void cacheuse_initcache(cache_t2* c)
+{
+   int i;
+   unsigned int start_mask, start_val;
+   unsigned int end_mask, end_val;
+
+   c->use = TG_MALLOC("cl.sim.cu_ic.1",
+                      sizeof(line_use) * c->sets * c->assoc);
+   c->loaded = TG_MALLOC("cl.sim.cu_ic.2",
+                         sizeof(line_loaded) * c->sets * c->assoc);
+   c->line_start_mask = TG_MALLOC("cl.sim.cu_ic.3",
+                                  sizeof(int) * c->line_size);
+   c->line_end_mask = TG_MALLOC("cl.sim.cu_ic.4",
+                                sizeof(int) * c->line_size);
+
+   c->line_size_mask = c->line_size-1;
+
+   /* Meaning of line_start_mask/line_end_mask
+    * Example: for a given cache line, you get an access starting at
+    * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
+    * line size of 32, you have 1 bit per byte in the mask:
+    *
+    *   bit31   bit8 bit5  bit 0
+    *       |      |  |    |
+    *       11..111111100000   line_start_mask[5]
+    *       00..000111111111   line_end_mask[(5+4)-1]
+    *
+    *  use_mask |= line_start_mask[5] && line_end_mask[8]
+    *
+    */
+   start_val = end_val = ~0;
+   if (c->line_size < 32) {
+      int bits_per_byte = 32/c->line_size;
+      start_mask = (1<<bits_per_byte)-1;
+      end_mask = start_mask << (32-bits_per_byte);
+      for(i=0;i<c->line_size;i++) {
+         c->line_start_mask[i] = start_val;
+         start_val = start_val & ~start_mask;
+         start_mask = start_mask << bits_per_byte;
+
+         c->line_end_mask[c->line_size-i-1] = end_val;
+         end_val = end_val & ~end_mask;
+         end_mask = end_mask >> bits_per_byte;
+      }
+   }
+   else {
+      int bytes_per_bit = c->line_size/32;
+      start_mask = 1;
+      end_mask = 1u << 31;
+      for(i=0;i<c->line_size;i++) {
+         c->line_start_mask[i] = start_val;
+         c->line_end_mask[c->line_size-i-1] = end_val;
+         if ( ((i+1)%bytes_per_bit) == 0) {
+            start_val &= ~start_mask;
+            end_val &= ~end_mask;
+            start_mask <<= 1;
+            end_mask >>= 1;
+         }
+      }
+   }
+
+   TG_DEBUG(6, "Config %s:\n", c->desc_line);
+   for(i=0;i<c->line_size;i++) {
+      TG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
+               i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]);
+   }
+
+   /* We use lower tag bits as offset pointers to cache use info.
+    * I.e. some cache parameters don't work.
+    */
+   if ( (1<<c->tag_shift) < c->assoc) {
+      VG_(message)(Vg_DebugMsg,
+                   "error: Use associativity < %d for cache use statistics!\n",
+                   (1<<c->tag_shift) );
+      VG_(tool_panic)("Unsupported cache configuration");
+   }
+}
+
+
+/* for I1/D1 caches */
+#define CACHEUSE(L) \
+ \
+static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
+{ \
+   UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
+   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
+   UWord tag = a & L.tag_mask; \
+   UWord tag2; \
+   int i, j, idx; \
+   UWord *set, tmp_tag; \
+   UInt use_mask; \
+ \
+   TG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", \
+            L.name, a, size, set1, set2); \
+ \
+   /* First case: word entirely within line. */ \
+   if (set1 == set2) { \
+ \
+      set = &(L.tags[set1 * L.assoc]); \
+      use_mask = L.line_start_mask[a & L.line_size_mask] & \
+                 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
+ \
+      /* This loop is unrolled for just the first case, which is the most */\
+      /* common. We can't unroll any further because it would screw up */\
+      /* if we have a direct-mapped (1-way) cache. */\
+      if (tag == (set[0] & L.tag_mask)) { \
+         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
+         L.use[idx].count ++; \
+         L.use[idx].mask |= use_mask; \
+         TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
+                  idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
+                  use_mask, L.use[idx].mask, L.use[idx].count); \
+         return L1_Hit; \
+      } \
+      /* If the tag is one other than the MRU, move it into the MRU spot */\
+      /* and shuffle the rest down. */\
+      for (i = 1; i < L.assoc; i++) { \
+         if (tag == (set[i] & L.tag_mask)) { \
+            tmp_tag = set[i]; \
+            for (j = i; j > 0; j--) { \
+               set[j] = set[j - 1]; \
+            } \
+            set[0] = tmp_tag; \
+            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
+            L.use[idx].count ++; \
+            L.use[idx].mask |= use_mask; \
+            TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
+                     i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
+                     use_mask, L.use[idx].mask, L.use[idx].count); \
+            return L1_Hit; \
+         } \
+      } \
+ \
+      /* A miss; install this tag as MRU, shuffle rest down. */ \
+      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
+      for (j = L.assoc - 1; j > 0; j--) { \
+         set[j] = set[j - 1]; \
+      } \
+      set[0] = tag | tmp_tag; \
+      idx = (set1 * L.assoc) + tmp_tag; \
+      return update_##L##_use(&L, idx, \
+                              use_mask, a &~ L.line_size_mask); \
+ \
+   /* Second case: word straddles two lines.
*/ \ + /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ + } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \ + Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask]; \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + miss1 = update_##L##_use(&L, idx, \ + use_mask, a &~ L.line_size_mask); \ +block2: \ + set = &(L.tags[set2 * L.assoc]); \ + use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \ + tag2 = (a+size-1) & L.tag_mask; \ + if (tag2 == (set[0] & L.tag_mask)) { \ + idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag2 == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag2 | tmp_tag; \ + idx = (set2 * L.assoc) + tmp_tag; \ + miss2 = update_##L##_use(&L, idx, \ + use_mask, (a+size-1) &~ L.line_size_mask); \ + return (miss1==MemAccess || miss2==MemAccess) ? 
MemAccess:LL_Hit; \ + \ + } else { \ + VG_(printf)("addr: %#lx size: %u sets: %u %u", a, size, set1, set2); \ + VG_(tool_panic)("item straddles more than two cache sets"); \ + } \ + return 0; \ +} + + +/* logarithmic bitcounting algorithm, see + * http://graphics.stanford.edu/~seander/bithacks.html + */ +static __inline__ unsigned int countBits(unsigned int bits) +{ + unsigned int c; // store the total here + const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers + const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF}; + + c = bits; + c = ((c >> S[0]) & B[0]) + (c & B[0]); + c = ((c >> S[1]) & B[1]) + (c & B[1]); + c = ((c >> S[2]) & B[2]) + (c & B[2]); + c = ((c >> S[3]) & B[3]) + (c & B[3]); + c = ((c >> S[4]) & B[4]) + (c & B[4]); + return c; +} + +static void update_LL_use(int idx, Addr memline) +{ + line_loaded* loaded = &(LL.loaded[idx]); + line_use* use = &(LL.use[idx]); + int i = ((32 - countBits(use->mask)) * LL.line_size)>>5; + + TG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n", + idx, TG_(bb_base) + current_ii->instr_offset, memline); + if (use->count>0) { + TG_DEBUG(2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", + use->count, i, use->mask, loaded->memline, loaded->iaddr); + TG_DEBUG(2, " collect: %d, use_base %p\n", + TG_(current_state).collect, loaded->use_base); + + if (TG_(current_state).collect && loaded->use_base) { + (loaded->use_base)[off_LL_AcCost] += 1000 / use->count; + (loaded->use_base)[off_LL_SpLoss] += i; + } + } + + use->count = 0; + use->mask = 0; + + loaded->memline = memline; + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; + loaded->use_base = (TG_(current_state).nonskipped) ? + TG_(current_state).nonskipped->skipped : + TG_(cost_base) + current_ii->cost_offset; +} + +static +CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded) +{ + UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1); + UWord* set = &(LL.tags[setNo * LL.assoc]); + UWord tag = memline & LL.tag_mask; + + int i, j, idx; + UWord tmp_tag; + + TG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo); + + if (tag == (set[0] & LL.tag_mask)) { + idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } + for (i = 1; i < LL.assoc; i++) { + if (tag == (set[i] & LL.tag_mask)) { + tmp_tag = set[i]; + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. 
 */
+ tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
+ for (j = LL.assoc - 1; j > 0; j--) {
+ set[j] = set[j - 1];
+ }
+ set[0] = tag | tmp_tag;
+ idx = (setNo * LL.assoc) + tmp_tag;
+ l1_loaded->dep_use = &(LL.use[idx]);
+
+ update_LL_use(idx, memline);
+
+ return MemAccess;
+}
+
+
+
+
+#define UPDATE_USE(L) \
+ \
+static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
+ UInt mask, Addr memline) \
+{ \
+ line_loaded* loaded = &(cache->loaded[idx]); \
+ line_use* use = &(cache->use[idx]); \
+ int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
+ \
+ TG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
+ cache->name, idx, TG_(bb_base) + current_ii->instr_offset, memline, mask); \
+ if (use->count>0) { \
+ TG_DEBUG(2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",\
+ use->count, c, use->mask, loaded->memline, loaded->iaddr); \
+ TG_DEBUG(2, " collect: %d, use_base %p\n", \
+ TG_(current_state).collect, loaded->use_base); \
+ \
+ if (TG_(current_state).collect && loaded->use_base) { \
+ (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
+ (loaded->use_base)[off_##L##_SpLoss] += c; \
+ \
+ /* FIXME (?): L1/LL line sizes must be equal ! */ \
+ loaded->dep_use->mask |= use->mask; \
+ loaded->dep_use->count += use->count; \
+ } \
+ } \
+ \
+ use->count = 1; \
+ use->mask = mask; \
+ loaded->memline = memline; \
+ loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; \
+ loaded->use_base = (TG_(current_state).nonskipped) ? \
+ TG_(current_state).nonskipped->skipped : \
+ TG_(cost_base) + current_ii->cost_offset; \
+ \
+ if (memline == 0) return LL_Hit; \
+ return cacheuse_LL_access(memline, loaded); \
+}
+
+UPDATE_USE(I1);
+UPDATE_USE(D1);
+
+CACHEUSE(I1);
+CACHEUSE(D1);
+
+
+static
+void cacheuse_finish(void)
+{
+ int i;
+ InstrInfo ii = { 0,0,0,0 };
+
+ if (!TG_(current_state).collect) return;
+
+ TG_(bb_base) = 0;
+ current_ii = &ii; /* needs to be set for update_XX_use */
+ TG_(cost_base) = 0;
+
+ /* update usage counters */
+ if (I1.use)
+ for (i = 0; i < I1.sets * I1.assoc; i++)
+ if (I1.loaded[i].use_base)
+ update_I1_use( &I1, i, 0,0);
+
+ if (D1.use)
+ for (i = 0; i < D1.sets * D1.assoc; i++)
+ if (D1.loaded[i].use_base)
+ update_D1_use( &D1, i, 0,0);
+
+ if (LL.use)
+ for (i = 0; i < LL.sets * LL.assoc; i++)
+ if (LL.loaded[i].use_base)
+ update_LL_use(i, 0);
+
+ current_ii = 0;
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Helper functions called by instrumented code ---*/
+/*------------------------------------------------------------*/
+
+
+static __inline__
+void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
+{
+ switch(r) {
+ case WriteBackMemAccess:
+ if (clo_simulate_writeback) {
+ c1[3]++;
+ c2[3]++;
+ }
+ // fall through
+
+ case MemAccess:
+ c1[2]++;
+ c2[2]++;
+ // fall through
+
+ case LL_Hit:
+ c1[1]++;
+ c2[1]++;
+ // fall through
+
+ default:
+ c1[0]++;
+ c2[0]++;
+ }
+}
+
+static
+const HChar* cacheRes(CacheModelResult r)
+{
+ switch(r) {
+ case L1_Hit: return "L1 Hit ";
+ case LL_Hit: return "LL Hit ";
+ case MemAccess: return "LL Miss";
+ case WriteBackMemAccess: return "LL Miss (dirty)";
+ default:
+ tl_assert(0);
+ }
+ return "??";
+}
+
+VG_REGPARM(1)
+static void log_1I0D(InstrInfo* ii)
+{
+ CacheModelResult IrRes;
+
+ current_ii = ii;
+ IrRes = (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size);
+
+ TG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n",
+ TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
+
+ if 
(TG_(current_state).collect) { + ULong* cost_Ir; + + if (TG_(current_state).nonskipped) + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + else + cost_Ir = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + + inc_costs(IrRes, cost_Ir, + TG_(current_state).cost + fullOffset(EG_IR) ); + } +} + +VG_REGPARM(2) +static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2) +{ + CacheModelResult Ir1Res, Ir2Res; + ULong *global_cost_Ir; + + current_ii = ii1; + Ir1Res = (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + + TG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) ); + + if (!TG_(current_state).collect) return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); +} + +VG_REGPARM(3) +static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3) +{ + CacheModelResult Ir1Res, Ir2Res, Ir3Res; + ULong *global_cost_Ir; + + current_ii = ii1; + Ir1Res = (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + current_ii = ii3; + Ir3Res = (*simulator.I1_Read)(TG_(bb_base) + ii3->instr_offset, ii3->instr_size); + + TG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res), + TG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) ); + + if (!TG_(current_state).collect) return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); + inc_costs(Ir3Res, global_cost_Ir, + TG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]); +} + +/* Instruction doing a read access */ + +VG_REGPARM(3) +static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult IrRes, DrRes; + + current_ii = ii; + IrRes = (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dr; + + if 
(TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + } + else { + cost_Ir = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dr = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + } + + inc_costs(IrRes, cost_Ir, + TG_(current_state).cost + fullOffset(EG_IR) ); + inc_costs(DrRes, cost_Dr, + TG_(current_state).cost + fullOffset(EG_DR) ); + } +} + + +/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw + have exactly the same prototype. If you change them, you must + change addEvent_D_guarded too. */ +VG_REGPARM(3) +static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult DrRes; + + current_ii = ii; + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n", + data_addr, data_size, cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong *cost_Dr; + + if (TG_(current_state).nonskipped) + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + else + cost_Dr = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + + inc_costs(DrRes, cost_Dr, + TG_(current_state).cost + fullOffset(EG_DR) ); + } +} + + +/* Instruction doing a write access */ + +VG_REGPARM(3) +static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult IrRes, DwRes; + + current_ii = ii; + IrRes = (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dw; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + } + else { + cost_Ir = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dw = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + } + + inc_costs(IrRes, cost_Ir, + TG_(current_state).cost + fullOffset(EG_IR) ); + inc_costs(DwRes, cost_Dw, + TG_(current_state).cost + fullOffset(EG_DW) ); + } +} + +/* See comment on log_0I1Dr. */ +VG_REGPARM(3) +static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult DwRes; + + current_ii = ii; + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n", + data_addr, data_size, cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong *cost_Dw; + + if (TG_(current_state).nonskipped) + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + else + cost_Dw = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + + inc_costs(DwRes, cost_Dw, + TG_(current_state).cost + fullOffset(EG_DW) ); + } +} + + + +/*------------------------------------------------------------*/ +/*--- Cache configuration ---*/ +/*------------------------------------------------------------*/ + +static cache_t clo_I1_cache = UNDEFINED_CACHE; +static cache_t clo_D1_cache = UNDEFINED_CACHE; +static cache_t clo_LL_cache = UNDEFINED_CACHE; + +/* Initialize and clear simulator state */ +static void cachesim_post_clo_init(void) +{ + /* Cache configurations. 
*/ + cache_t I1c, D1c, LLc; + + /* Initialize access handlers */ + if (!TG_(clo).simulate_cache) { + TG_(cachesim).log_1I0D = 0; + TG_(cachesim).log_1I0D_name = "(no function)"; + TG_(cachesim).log_2I0D = 0; + TG_(cachesim).log_2I0D_name = "(no function)"; + TG_(cachesim).log_3I0D = 0; + TG_(cachesim).log_3I0D_name = "(no function)"; + + TG_(cachesim).log_1I1Dr = 0; + TG_(cachesim).log_1I1Dr_name = "(no function)"; + TG_(cachesim).log_1I1Dw = 0; + TG_(cachesim).log_1I1Dw_name = "(no function)"; + + TG_(cachesim).log_0I1Dr = 0; + TG_(cachesim).log_0I1Dr_name = "(no function)"; + TG_(cachesim).log_0I1Dw = 0; + TG_(cachesim).log_0I1Dw_name = "(no function)"; + return; + } + + /* Configuration of caches only needed with real cache simulation */ + VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc, + &clo_I1_cache, + &clo_D1_cache, + &clo_LL_cache); + + I1.name = "I1"; + D1.name = "D1"; + LL.name = "LL"; + + // min_line_size is used to make sure that we never feed + // accesses to the simulator straddling more than two + // cache lines at any cache level + TG_(min_line_size) = (I1c.line_size < D1c.line_size) + ? I1c.line_size : D1c.line_size; + TG_(min_line_size) = (LLc.line_size < TG_(min_line_size)) + ? LLc.line_size : TG_(min_line_size); + + Int largest_load_or_store_size + = VG_(machine_get_size_of_largest_guest_register)(); + if (TG_(min_line_size) < largest_load_or_store_size) { + /* We can't continue, because the cache simulation might + straddle more than 2 lines, and it will assert. So let's + just stop before we start. */ + VG_(umsg)("Tracegrind: cannot continue: the minimum line size (%d)\n", + (Int)TG_(min_line_size)); + VG_(umsg)(" must be equal to or larger than the maximum register size (%d)\n", + largest_load_or_store_size ); + VG_(umsg)(" but it is not. 
Exiting now.\n"); + VG_(exit)(1); + } + + cachesim_initcache(I1c, &I1); + cachesim_initcache(D1c, &D1); + cachesim_initcache(LLc, &LL); + + /* the other cache simulators use the standard helpers + * with dispatching via simulator struct */ + + TG_(cachesim).log_1I0D = log_1I0D; + TG_(cachesim).log_1I0D_name = "log_1I0D"; + TG_(cachesim).log_2I0D = log_2I0D; + TG_(cachesim).log_2I0D_name = "log_2I0D"; + TG_(cachesim).log_3I0D = log_3I0D; + TG_(cachesim).log_3I0D_name = "log_3I0D"; + + TG_(cachesim).log_1I1Dr = log_1I1Dr; + TG_(cachesim).log_1I1Dw = log_1I1Dw; + TG_(cachesim).log_1I1Dr_name = "log_1I1Dr"; + TG_(cachesim).log_1I1Dw_name = "log_1I1Dw"; + + TG_(cachesim).log_0I1Dr = log_0I1Dr; + TG_(cachesim).log_0I1Dw = log_0I1Dw; + TG_(cachesim).log_0I1Dr_name = "log_0I1Dr"; + TG_(cachesim).log_0I1Dw_name = "log_0I1Dw"; + + if (clo_collect_cacheuse) { + + /* Output warning for not supported option combinations */ + if (clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, + "warning: prefetch simulation can not be " + "used with cache usage\n"); + clo_simulate_hwpref = False; + } + + if (clo_simulate_writeback) { + VG_(message)(Vg_DebugMsg, + "warning: write-back simulation can not be " + "used with cache usage\n"); + clo_simulate_writeback = False; + } + + simulator.I1_Read = cacheuse_I1_doRead; + simulator.D1_Read = cacheuse_D1_doRead; + simulator.D1_Write = cacheuse_D1_doRead; + return; + } + + if (clo_simulate_hwpref) { + prefetch_clear(); + + if (clo_simulate_writeback) { + simulator.I1_Read = prefetch_I1_Read; + simulator.D1_Read = prefetch_D1_Read; + simulator.D1_Write = prefetch_D1_Write; + } + else { + simulator.I1_Read = prefetch_I1_ref; + simulator.D1_Read = prefetch_D1_ref; + simulator.D1_Write = prefetch_D1_ref; + } + + return; + } + + if (clo_simulate_writeback) { + simulator.I1_Read = cachesim_I1_Read; + simulator.D1_Read = cachesim_D1_Read; + simulator.D1_Write = cachesim_D1_Write; + } + else { + simulator.I1_Read = cachesim_I1_ref; + simulator.D1_Read = cachesim_D1_ref; + simulator.D1_Write = cachesim_D1_ref; + } +} + + +/* Clear simulator state. Has to be initialized before */ +static +void cachesim_clear(void) +{ + cachesim_clearcache(&I1); + cachesim_clearcache(&D1); + cachesim_clearcache(&LL); + + prefetch_clear(); +} + + +static void cachesim_dump_desc(VgFile *fp) +{ + VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line); + VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line); + VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line); +} + +static +void cachesim_print_opts(void) +{ + VG_(printf)( +"\n cache simulator options (does cache simulation if used):\n" +" --simulate-wb=no|yes Count write-back events [no]\n" +" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n" +#if TG_EXPERIMENTAL +" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n" +#endif +" --cacheuse=no|yes Collect cache block use [no]\n"); + VG_(print_cache_clo_opts)(); +} + +/* Check for command line option for cache configuration. + * Return False if unknown and not handled. 
+ * + * Called from TG_(process_cmd_line_option)() in clo.c + */ +static Bool cachesim_parse_opt(const HChar* arg) +{ + if VG_BOOL_CLO(arg, "--simulate-wb", clo_simulate_writeback) {} + else if VG_BOOL_CLO(arg, "--simulate-hwpref", clo_simulate_hwpref) {} + else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors) {} + + else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) { + if (clo_collect_cacheuse) { + /* Use counters only make sense with fine dumping */ + TG_(clo).dump_instr = True; + } + } + + else if (VG_(str_clo_cache_opt)(arg, + &clo_I1_cache, + &clo_D1_cache, + &clo_LL_cache)) {} + + else + return False; + + return True; +} + +static +void cachesim_printstat(Int l1, Int l2, Int l3) +{ + FullCost total = TG_(total_cost), D_total = 0; + ULong LL_total_m, LL_total_mr, LL_total_mw, + LL_total, LL_total_r, LL_total_w; + + if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n", + prefetch_up); + VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n", + prefetch_down); + VG_(message)(Vg_DebugMsg, "\n"); + } + + VG_(message)(Vg_UserMsg, "I1 misses: %'*llu\n", l1, + total[fullOffset(EG_IR) +1]); + + VG_(message)(Vg_UserMsg, "LLi misses: %'*llu\n", l1, + total[fullOffset(EG_IR) +2]); + + if (0 == total[fullOffset(EG_IR)]) + total[fullOffset(EG_IR)] = 1; + + VG_(message)(Vg_UserMsg, "I1 miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "\n"); + + /* D cache results. + Use the D_refs.rd and D_refs.wr values to determine the + * width of columns 2 & 3. */ + + D_total = TG_(get_eventset_cost)( TG_(sets).full ); + TG_(init_cost)( TG_(sets).full, D_total); + // we only use the first 3 values of D_total, adding up Dr and Dw costs + TG_(copy_cost)( TG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) ); + TG_(add_cost) ( TG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) ); + + VG_(message)(Vg_UserMsg, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[0], + l2, total[fullOffset(EG_DR)], + l3, total[fullOffset(EG_DW)]); + + VG_(message)(Vg_UserMsg, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[1], + l2, total[fullOffset(EG_DR)+1], + l3, total[fullOffset(EG_DW)+1]); + + VG_(message)(Vg_UserMsg, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[2], + l2, total[fullOffset(EG_DR)+2], + l3, total[fullOffset(EG_DW)+2]); + + if (0 == D_total[0]) D_total[0] = 1; + if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1; + if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1; + + VG_(message)(Vg_UserMsg, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", + l1, D_total[1] * 100.0 / D_total[0], + l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)], + l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]); + + VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", + l1, D_total[2] * 100.0 / D_total[0], + l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)], + l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]); + VG_(message)(Vg_UserMsg, "\n"); + + + + /* LL overall results */ + + LL_total = + total[fullOffset(EG_DR) +1] + + total[fullOffset(EG_DW) +1] + + total[fullOffset(EG_IR) +1]; + LL_total_r = + total[fullOffset(EG_DR) +1] + + total[fullOffset(EG_IR) +1]; + LL_total_w = total[fullOffset(EG_DW) +1]; + 
VG_(message)(Vg_UserMsg, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total, l2, LL_total_r, l3, LL_total_w); + + LL_total_m = + total[fullOffset(EG_DR) +2] + + total[fullOffset(EG_DW) +2] + + total[fullOffset(EG_IR) +2]; + LL_total_mr = + total[fullOffset(EG_DR) +2] + + total[fullOffset(EG_IR) +2]; + LL_total_mw = total[fullOffset(EG_DW) +2]; + VG_(message)(Vg_UserMsg, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw); + + VG_(message)(Vg_UserMsg, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", + l1, LL_total_m * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]), + l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]), + l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]); +} + + +/*------------------------------------------------------------*/ +/*--- Setup for Event set. ---*/ +/*------------------------------------------------------------*/ + +struct event_sets TG_(sets); + +void TG_(init_eventsets)(void) +{ + // Event groups from which the event sets are composed + // the "Use" group only is used with "cacheuse" simulation + if (clo_collect_cacheuse) + TG_(register_event_group4)(EG_USE, + "AcCost1", "SpLoss1", "AcCost2", "SpLoss2"); + + if (!TG_(clo).simulate_cache) + TG_(register_event_group)(EG_IR, "Ir"); + else if (!clo_simulate_writeback) { + TG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr"); + TG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr"); + TG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw"); + } + else { // clo_simulate_writeback + TG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr"); + TG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr"); + TG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw"); + } + + if (TG_(clo).simulate_branch) { + TG_(register_event_group2)(EG_BC, "Bc", "Bcm"); + TG_(register_event_group2)(EG_BI, "Bi", "Bim"); + } + + if (TG_(clo).collect_bus) + TG_(register_event_group)(EG_BUS, "Ge"); + + if (TG_(clo).collect_alloc) + TG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize"); + + if (TG_(clo).collect_systime != systime_no) { + if (TG_(clo).collect_systime == systime_nsec) + TG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime"); + else + TG_(register_event_group2)(EG_SYS, "sysCount", "sysTime"); + } + + // event set used as base for instruction self cost + TG_(sets).base = TG_(get_event_set2)(EG_USE, EG_IR); + + // event set comprising all event groups, used for inclusive cost + TG_(sets).full = TG_(add_event_group2)(TG_(sets).base, EG_DR, EG_DW); + TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_BC, EG_BI); + TG_(sets).full = TG_(add_event_group) (TG_(sets).full, EG_BUS); + TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_ALLOC, EG_SYS); + + TG_DEBUGIF(1) { + TG_DEBUG(1, "EventSets:\n"); + TG_(print_eventset)(-2, TG_(sets).base); + TG_(print_eventset)(-2, TG_(sets).full); + } + + /* Not-existing events are silently ignored */ + TG_(dumpmap) = TG_(get_eventmapping)(TG_(sets).full); + TG_(append_event)(TG_(dumpmap), "Ir"); + TG_(append_event)(TG_(dumpmap), "Dr"); + TG_(append_event)(TG_(dumpmap), "Dw"); + TG_(append_event)(TG_(dumpmap), "I1mr"); + TG_(append_event)(TG_(dumpmap), "D1mr"); + TG_(append_event)(TG_(dumpmap), "D1mw"); + TG_(append_event)(TG_(dumpmap), "ILmr"); + TG_(append_event)(TG_(dumpmap), "DLmr"); + TG_(append_event)(TG_(dumpmap), "DLmw"); + TG_(append_event)(TG_(dumpmap), "ILdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmr"); + TG_(append_event)(TG_(dumpmap), 
"DLdmw"); + TG_(append_event)(TG_(dumpmap), "Bc"); + TG_(append_event)(TG_(dumpmap), "Bcm"); + TG_(append_event)(TG_(dumpmap), "Bi"); + TG_(append_event)(TG_(dumpmap), "Bim"); + TG_(append_event)(TG_(dumpmap), "AcCost1"); + TG_(append_event)(TG_(dumpmap), "SpLoss1"); + TG_(append_event)(TG_(dumpmap), "AcCost2"); + TG_(append_event)(TG_(dumpmap), "SpLoss2"); + TG_(append_event)(TG_(dumpmap), "Ge"); + TG_(append_event)(TG_(dumpmap), "allocCount"); + TG_(append_event)(TG_(dumpmap), "allocSize"); + TG_(append_event)(TG_(dumpmap), "sysCount"); + TG_(append_event)(TG_(dumpmap), "sysTime"); + TG_(append_event)(TG_(dumpmap), "sysCpuTime"); +} + + +/* this is called at dump time for every instruction executed */ +static void cachesim_add_icost(SimCost cost, BBCC* bbcc, + InstrInfo* ii, ULong exe_count) +{ + if (!TG_(clo).simulate_cache) + cost[ fullOffset(EG_IR) ] += exe_count; + + if (ii->eventset) + TG_(add_and_zero_cost2)( TG_(sets).full, cost, + ii->eventset, bbcc->cost + ii->cost_offset); +} + +static +void cachesim_finish(void) +{ + if (clo_collect_cacheuse) + cacheuse_finish(); +} + +/*------------------------------------------------------------*/ +/*--- The simulator defined in this file ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if TG_(cachesim) = { + .print_opts = cachesim_print_opts, + .parse_opt = cachesim_parse_opt, + .post_clo_init = cachesim_post_clo_init, + .clear = cachesim_clear, + .dump_desc = cachesim_dump_desc, + .printstat = cachesim_printstat, + .add_icost = cachesim_add_icost, + .finish = cachesim_finish, + + /* these will be set by cachesim_post_clo_init */ + .log_1I0D = 0, + .log_2I0D = 0, + .log_3I0D = 0, + + .log_1I1Dr = 0, + .log_1I1Dw = 0, + + .log_0I1Dr = 0, + .log_0I1Dw = 0, + + .log_1I0D_name = "(no function)", + .log_2I0D_name = "(no function)", + .log_3I0D_name = "(no function)", + + .log_1I1Dr_name = "(no function)", + .log_1I1Dw_name = "(no function)", + + .log_0I1Dr_name = "(no function)", + .log_0I1Dw_name = "(no function)", +}; + + +/*--------------------------------------------------------------------*/ +/*--- end ct_sim.c ---*/ +/*--------------------------------------------------------------------*/ diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am new file mode 100644 index 000000000..5351eb577 --- /dev/null +++ b/tracegrind/tests/Makefile.am @@ -0,0 +1,3 @@ +dist_noinst_SCRIPTS = + +EXTRA_DIST = diff --git a/tracegrind/threads.c b/tracegrind/threads.c new file mode 100644 index 000000000..cb311e000 --- /dev/null +++ b/tracegrind/threads.c @@ -0,0 +1,456 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_threads.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include "pub_tool_threadstate.h" + +/* forward decls */ +static exec_state* exec_state_save(void); +static exec_state* exec_state_restore(void); +static exec_state* push_exec_state(int); +static exec_state* top_exec_state(void); + +static exec_stack current_states; + + +/*------------------------------------------------------------*/ +/*--- Support for multi-threading ---*/ +/*------------------------------------------------------------*/ + + +/* + * For Valgrind, MT is cooperative (no preemting in our code), + * so we don't need locks... + * + * Per-thread data: + * - BBCCs + * - call stack + * - call hash + * - event counters: last, current + * + * Even when ignoring MT, we need this functions to set up some + * datastructures for the process (= Thread 1). + */ + +/* current running thread */ +ThreadId TG_(current_tid); + +static thread_info** thread; + +thread_info** TG_(get_threads)(void) +{ + return thread; +} + +thread_info* TG_(get_current_thread)(void) +{ + return thread[TG_(current_tid)]; +} + +void TG_(init_threads)(void) +{ + UInt i; + + thread = TG_MALLOC("cl.threads.it.1", VG_N_THREADS * sizeof thread[0]); + + for(i=0;istates) ); + TG_(init_call_stack)( &(t->calls) ); + TG_(init_fn_stack) ( &(t->fns) ); + /* t->states.entry[0]->cxt = TG_(get_cxt)(t->fns.bottom); */ + + /* event counters */ + t->lastdump_cost = TG_(get_eventset_cost)( TG_(sets).full ); + t->sighandler_cost = TG_(get_eventset_cost)( TG_(sets).full ); + TG_(init_cost)( TG_(sets).full, t->lastdump_cost ); + TG_(init_cost)( TG_(sets).full, t->sighandler_cost ); + + /* init data containers */ + TG_(init_fn_array)( &(t->fn_active) ); + TG_(init_bbcc_hash)( &(t->bbccs) ); + TG_(init_jcc_hash)( &(t->jccs) ); + + return t; +} + + +void TG_(switch_thread)(ThreadId tid) +{ + if (tid == TG_(current_tid)) return; + + TG_DEBUG(0, ">> thread %u (was %u)\n", tid, TG_(current_tid)); + + if (TG_(current_tid) != VG_INVALID_THREADID) { + /* save thread state */ + thread_info* t = thread[TG_(current_tid)]; + + TG_ASSERT(t != 0); + + /* current context (including signal handler contexts) */ + exec_state_save(); + TG_(copy_current_exec_stack)( &(t->states) ); + TG_(copy_current_call_stack)( &(t->calls) ); + TG_(copy_current_fn_stack) ( &(t->fns) ); + + TG_(copy_current_fn_array) ( &(t->fn_active) ); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) t = thread[1]; + TG_(copy_current_bbcc_hash)( &(t->bbccs) ); + TG_(copy_current_jcc_hash) ( &(t->jccs) ); + } + + TG_(current_tid) = tid; + TG_ASSERT(tid < VG_N_THREADS); + + if (tid != VG_INVALID_THREADID) { + thread_info* t; + + /* load thread state */ + + if (thread[tid] == 0) thread[tid] = new_thread(); + t = thread[tid]; + + /* current context (including signal handler contexts) */ + TG_(set_current_exec_stack)( &(t->states) ); + exec_state_restore(); + TG_(set_current_call_stack)( &(t->calls) ); + TG_(set_current_fn_stack) ( &(t->fns) ); + + TG_(set_current_fn_array) ( &(t->fn_active) ); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) t = thread[1]; + TG_(set_current_bbcc_hash) ( &(t->bbccs) ); + TG_(set_current_jcc_hash) ( &(t->jccs) ); + } +} + + +void TG_(run_thread)(ThreadId tid) +{ + /* check for dumps needed */ + static ULong bbs_done = 0; + HChar buf[50]; // large 
enough + + if (TG_(clo).dump_every_bb >0) { + if (TG_(stat).bb_executions - bbs_done > TG_(clo).dump_every_bb) { + VG_(sprintf)(buf, "--dump-every-bb=%llu", TG_(clo).dump_every_bb); + TG_(dump_profile)(buf, False); + bbs_done = TG_(stat).bb_executions; + } + } + + /* now check for thread switch */ + TG_(switch_thread)(tid); +} + +void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack) +{ + exec_state *es; + + TG_DEBUG(0, ">> pre_signal(TID %u, sig %d, alt_st %s)\n", + tid, sigNum, alt_stack ? "yes":"no"); + + /* switch to the thread the handler runs in */ + TG_(switch_thread)(tid); + + /* save current execution state */ + exec_state_save(); + + /* setup new cxtinfo struct for this signal handler */ + es = push_exec_state(sigNum); + TG_(zero_cost)( TG_(sets).full, es->cost ); + TG_(current_state).cost = es->cost; + es->call_stack_bottom = TG_(current_call_stack).sp; + + /* setup current state for a spontaneous call */ + TG_(init_exec_state)( &TG_(current_state) ); + TG_(current_state).sig = sigNum; + TG_(push_cxt)(0); +} + +/* Run post-signal if the stackpointer for call stack is at + * the bottom in current exec state (e.g. a signal handler) + * + * Called from TG_(pop_call_stack) + */ +void TG_(run_post_signal_on_call_stack_bottom)(void) +{ + exec_state* es = top_exec_state(); + TG_ASSERT(es != 0); + TG_ASSERT(TG_(current_state).sig >0); + + if (TG_(current_call_stack).sp == es->call_stack_bottom) + TG_(post_signal)( TG_(current_tid), TG_(current_state).sig ); +} + +void TG_(post_signal)(ThreadId tid, Int sigNum) +{ + exec_state* es; + UInt fn_number, *pactive; + + TG_DEBUG(0, ">> post_signal(TID %u, sig %d)\n", + tid, sigNum); + + /* thread switching potentially needed, eg. with instrumentation off */ + TG_(switch_thread)(tid); + TG_ASSERT(sigNum == TG_(current_state).sig); + + /* Unwind call stack of this signal handler. + * This should only be needed at finalisation time + */ + es = top_exec_state(); + TG_ASSERT(es != 0); + while(TG_(current_call_stack).sp > es->call_stack_bottom) + TG_(pop_call_stack)(); + + if (TG_(current_state).cxt) { + /* correct active counts */ + fn_number = TG_(current_state).cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + TG_DEBUG(0, " set active count of %s back to %u\n", + TG_(current_state).cxt->fn[0]->name, *pactive); + } + + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) { + /* set fn_stack_top back. + * top can point to 0 if nothing was executed in the signal handler; + * this is possible at end on unwinding handlers. + */ + if (*(TG_(current_fn_stack).top) != 0) { + TG_(current_fn_stack).top--; + TG_ASSERT(*(TG_(current_fn_stack).top) == 0); + } + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) + TG_(current_fn_stack).top--; + } + + /* sum up costs */ + TG_ASSERT(TG_(current_state).cost == es->cost); + TG_(add_and_zero_cost)( TG_(sets).full, + thread[TG_(current_tid)]->sighandler_cost, + TG_(current_state).cost ); + + /* restore previous context */ + es->sig = -1; + current_states.sp--; + es = top_exec_state(); + TG_(current_state).sig = es->sig; + exec_state_restore(); + + /* There is no way to reliable get the thread ID we are switching to + * after this handler returns. So we sync with actual TID at start of + * TG_(setup_bb)(), which should be the next for tracegrind. 
+ */
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Execution states in a thread & signal handlers ---*/
+/*------------------------------------------------------------*/
+
+/* Each thread can be interrupted by a signal handler, and they
+ * themselves again. But as there's no scheduling among handlers
+ * of the same thread, we don't need additional stacks.
+ * So storing execution contexts and
+ * adding separators in the callstack(needed to not intermix normal/handler
+ * functions in contexts) should be enough.
+ */
+
+/* not initialized: call_stack_bottom, sig */
+void TG_(init_exec_state)(exec_state* es)
+{
+ es->collect = TG_(clo).collect_atstart;
+ es->cxt = 0;
+ es->jmps_passed = 0;
+ es->bbcc = 0;
+ es->nonskipped = 0;
+}
+
+
+static exec_state* new_exec_state(Int sigNum)
+{
+ exec_state* es;
+ es = (exec_state*) TG_MALLOC("cl.threads.nes.1",
+ sizeof(exec_state));
+
+ /* allocate real cost space: needed as incremented by
+ * simulation functions */
+ es->cost = TG_(get_eventset_cost)(TG_(sets).full);
+ TG_(zero_cost)( TG_(sets).full, es->cost );
+ TG_(init_exec_state)(es);
+ es->sig = sigNum;
+ es->call_stack_bottom = 0;
+
+ return es;
+}
+
+void TG_(init_exec_stack)(exec_stack* es)
+{
+ Int i;
+
+ /* The first element is for the main thread */
+ es->entry[0] = new_exec_state(0);
+ for(i=1;i<MAX_SIGHANDLERS;i++)
+ es->entry[i] = 0;
+ es->sp = 0;
+}
+
+void TG_(copy_current_exec_stack)(exec_stack* dst)
+{
+ Int i;
+
+ dst->sp = current_states.sp;
+ for(i=0;i<MAX_SIGHANDLERS;i++)
+ dst->entry[i] = current_states.entry[i];
+}
+
+void TG_(set_current_exec_stack)(exec_stack* dst)
+{
+ Int i;
+
+ current_states.sp = dst->sp;
+ for(i=0;i<MAX_SIGHANDLERS;i++)
+ current_states.entry[i] = dst->entry[i];
+}
+
+
+/* Get top context info struct of current thread */
+static
+exec_state* top_exec_state(void)
+{
+ Int sp = current_states.sp;
+ exec_state* es;
+
+ TG_ASSERT((sp >= 0) && (sp < MAX_SIGHANDLERS));
+ es = current_states.entry[sp];
+ TG_ASSERT(es != 0);
+ return es;
+}
+
+/* Allocates a free context info structure for a new entered
+ * signal handler, putting it on the context stack.
+ * Returns a pointer to the structure.
+ */
+static exec_state* push_exec_state(int sigNum)
+{
+ Int sp;
+ exec_state* es;
+
+ current_states.sp++;
+ sp = current_states.sp;
+
+ TG_ASSERT((sigNum > 0) && (sigNum <= _VKI_NSIG));
+ TG_ASSERT((sp > 0) && (sp < MAX_SIGHANDLERS));
+ es = current_states.entry[sp];
+ if (!es) {
+ es = new_exec_state(sigNum);
+ current_states.entry[sp] = es;
+ }
+ else
+ es->sig = sigNum;
+
+ return es;
+}
+
+/* Save current context to top cxtinfo struct */
+static
+exec_state* exec_state_save(void)
+{
+ exec_state* es = top_exec_state();
+
+ es->cxt = TG_(current_state).cxt;
+ es->collect = TG_(current_state).collect;
+ es->jmps_passed = TG_(current_state).jmps_passed;
+ es->bbcc = TG_(current_state).bbcc;
+ es->nonskipped = TG_(current_state).nonskipped;
+ TG_ASSERT(es->cost == TG_(current_state).cost);
+
+ TG_DEBUGIF(1) {
+ TG_DEBUG(1, " cxtinfo_save(sig %d): collect %s, jmps_passed %d\n",
+ es->sig, es->collect ? 
"Yes": "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + /* signal number does not need to be saved */ + TG_ASSERT(TG_(current_state).sig == es->sig); + + return es; +} + +static +exec_state* exec_state_restore(void) +{ + exec_state* es = top_exec_state(); + + TG_(current_state).cxt = es->cxt; + TG_(current_state).collect = es->collect; + TG_(current_state).jmps_passed = es->jmps_passed; + TG_(current_state).bbcc = es->bbcc; + TG_(current_state).nonskipped = es->nonskipped; + TG_(current_state).cost = es->cost; + TG_(current_state).sig = es->sig; + + TG_DEBUGIF(1) { + TG_DEBUG(1, " exec_state_restore(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? "Yes": "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cxt)(-9, es->cxt, 0); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + return es; +} diff --git a/tracegrind/tracegrind.h b/tracegrind/tracegrind.h new file mode 100644 index 000000000..20b683fbf --- /dev/null +++ b/tracegrind/tracegrind.h @@ -0,0 +1,131 @@ + +/* + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (tracegrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 3, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of tracegrind, a valgrind tool for cache simulation + and streaming CSV trace output. + + Based on callgrind, Copyright (C) 2003-2017 Josef Weidendorfer. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (tracegrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 3. 
See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +#ifndef __TRACEGRIND_H +#define __TRACEGRIND_H + +#include "valgrind.h" + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. + + The identification ('C','T') for Tracegrind has historical + reasons: it was called "Calltree" before. Besides, ('C','G') would + clash with cachegrind. We keep ('C','T') for compatibility with + callgrind client request macros. + */ + +typedef + enum { + VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'), + VG_USERREQ__ZERO_STATS, + VG_USERREQ__TOGGLE_COLLECT, + VG_USERREQ__DUMP_STATS_AT, + VG_USERREQ__START_INSTRUMENTATION, + VG_USERREQ__STOP_INSTRUMENTATION + } Vg_TracegrindClientRequest; + +/* Dump current state of cost centers, and zero them afterwards */ +#define TRACEGRIND_DUMP_STATS \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS, \ + 0, 0, 0, 0, 0) + +/* Dump current state of cost centers, and zero them afterwards. + The argument is appended to a string stating the reason which triggered + the dump. This string is written as a description field into the + profile data dump. */ +#define TRACEGRIND_DUMP_STATS_AT(pos_str) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS_AT, \ + pos_str, 0, 0, 0, 0) + +/* Zero cost centers */ +#define TRACEGRIND_ZERO_STATS \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ZERO_STATS, \ + 0, 0, 0, 0, 0) + +/* Toggles collection state. + The collection state specifies whether the happening of events + should be noted or if they are to be ignored. Events are noted + by increment of counters in a cost center */ +#define TRACEGRIND_TOGGLE_COLLECT \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, \ + 0, 0, 0, 0, 0) + +/* Start full tracegrind instrumentation if not already switched on. + When cache simulation is done, it will flush the simulated cache; + this will lead to an artificial cache warmup phase afterwards with + cache misses which would not have happened in reality. */ +#define TRACEGRIND_START_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, \ + 0, 0, 0, 0, 0) + +/* Stop full tracegrind instrumentation if not already switched off. + This flushes Valgrinds translation cache, and does no additional + instrumentation afterwards, which effectivly will run at the same + speed as the "none" tool (ie. at minimal slowdown). + Use this to bypass Tracegrind aggregation for uninteresting code parts. + To start Tracegrind in this mode to ignore the setup phase, use + the option "--instr-atstart=no". */ +#define TRACEGRIND_STOP_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, \ + 0, 0, 0, 0, 0) + +#endif /* __TRACEGRIND_H */ From a8f5a9ef583663e39c6ec2f59ddff95eabb50d68 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Wed, 4 Feb 2026 23:39:30 +0000 Subject: [PATCH 02/26] feat: replace callgraph dump with streaming CSV trace output Replace callgrind's accumulated callgraph output with streaming CSV trace data emitted at function ENTER/EXIT boundaries. Each row contains delta counters since the last sample, enabling per-call cost attribution. 
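
To illustrate the idea (a sketch only, not the literal implementation: the real
emission path is TG_(trace_emit_sample) in dump.c, and the names below are
invented for the example), the per-row delta bookkeeping amounts to:

    /* Conceptually, one trace row per ENTER/EXIT event: subtract the
     * thread's last_sample_cost baseline from its current counters,
     * print the differences as CSV columns, then advance the baseline. */
    static void emit_delta_row_sketch(ULong* current, ULong* last_sample, Int n)
    {
       Int i;
       for (i = 0; i < n; i++) {
          ULong delta = current[i] - last_sample[i]; /* cost since last row */
          /* ... append delta as one CSV column ... */
          last_sample[i] = current[i];               /* new baseline */
       }
    }
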
Key changes: - dump.c: Replace callgraph output with CSV trace (trace_open/emit/close) - callstack.c: Hook push/pop_call_stack to emit ENTER/EXIT samples - threads.c: Add per-thread last_sample_cost for delta tracking - global.h: Add trace_output struct and per-thread sample state - main.c: Open trace at init, close at fini, update copyright Co-Authored-By: Claude Opus 4.5 --- .gitignore | 27 + tracegrind/callstack.c | 11 + tracegrind/dump.c | 1811 +++++----------------------------------- tracegrind/global.h | 21 + tracegrind/main.c | 11 +- tracegrind/threads.c | 3 + 6 files changed, 294 insertions(+), 1590 deletions(-) diff --git a/.gitignore b/.gitignore index ea71bb0aa..6a2f18e30 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ /autom4te.cache /bin /cachegrind.out.* +/callgrind.out.* +/tracegrind.out.* /compile /config.guess /config.h* @@ -17,6 +19,7 @@ /config.status /config.sub /configure +/configure~ /default.supp /depcomp /glibc-2.X.supp @@ -161,6 +164,30 @@ /callgrind/tests/inline-samefile /callgrind/tests/inline-crossfile +# /tracegrind/ +/tracegrind/*.so +/tracegrind/.deps +/tracegrind/tracegrind-*-darwin +/tracegrind/tracegrind-*-linux +/tracegrind/tracegrind-*-solaris +/tracegrind/tracegrind-*-freebsd +/tracegrind/Makefile +/tracegrind/Makefile.in + +# /tracegrind/tests/ +/tracegrind/tests/*.dSYM +/tracegrind/tests/*.post.diff* +/tracegrind/tests/*.post.out +/tracegrind/tests/*.stderr.diff* +/tracegrind/tests/*.stderr.out +/tracegrind/tests/*.stdout.diff* +/tracegrind/tests/*.stdout.out +/tracegrind/tests/.deps +/tracegrind/tests/Makefile +/tracegrind/tests/Makefile.in +/tracegrind/tests/tracegrind.out.* +/tracegrind/tests/fibo + # /coregrind/ /coregrind/*.a /coregrind/*.dSYM diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index d80669174..80480eec4 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -250,6 +250,12 @@ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) TG_(current_call_stack).sp++; + /* Emit CSV trace sample on function entry */ + if (!skip && TG_(current_state).collect) { + fn_node* to_fn = to->cxt->fn[0]; + TG_(trace_emit_sample)(TG_(current_tid), "ENTER", to_fn); + } + /* To allow for above assertion we set context of next frame to 0 */ TG_ASSERT(TG_(current_call_stack).sp < TG_(current_call_stack).size); current_entry++; @@ -353,6 +359,11 @@ void TG_(pop_call_stack)(void) } TG_(stat).ret_counter++; + /* Emit CSV trace sample on function exit */ + if (TG_(current_state).collect) { + TG_(trace_emit_sample)(TG_(current_tid), "EXIT", to_fn); + } + /* restore context */ TG_(current_state).cxt = lower_entry->cxt; TG_(current_fn_stack).top = diff --git a/tracegrind/dump.c b/tracegrind/dump.c index cd602e7fd..c78c0f520 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -6,7 +6,7 @@ /* This file is part of Tracegrind, a Valgrind tool for call tracing. - Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + Based on callgrind, Copyright (C) 2002-2017, Josef Weidendorfer. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -30,21 +30,16 @@ #include "pub_tool_threadstate.h" #include "pub_tool_libcfile.h" +/* ================================================================== */ +/* === Legacy dump state (kept for totals verification) === */ +/* ================================================================== */ -/* Dump Part Counter */ static Int out_counter = 0; - static HChar* out_file = 0; static Bool dumps_initialized = False; -/* Command */ -static HChar *cmdbuf; - -/* Total reads/writes/misses sum over all dumps and threads. - * Updated during CC traversal at dump time. - */ +/* Total reads/writes/misses sum over all dumps and threads. */ FullCost TG_(total_cost) = 0; -static FullCost dump_total_cost = 0; EventMapping* TG_(dumpmap) = 0; @@ -53,1624 +48,268 @@ Int TG_(get_dump_counter)(void) return out_counter; } -/*------------------------------------------------------------*/ -/*--- Output file related stuff ---*/ -/*------------------------------------------------------------*/ - -/* Boolean dumping array */ -static Bool* dump_array = 0; -static Int dump_array_size = 0; -static Bool* obj_dumped = 0; -static Bool* file_dumped = 0; -static Bool* fn_dumped = 0; -static Bool* cxt_dumped = 0; - -static -void reset_dump_array(void) -{ - int i; - - TG_ASSERT(dump_array != 0); - - for(i=0;ifile = 0; - p->fn = 0; - p->obj = 0; - p->cxt = 0; - p->rec_index = 0; -} - - -static void print_obj(VgFile *fp, const HChar* prefix, obj_node* obj) -{ - if (TG_(clo).compress_strings) { - TG_ASSERT(obj_dumped != 0); - if (obj_dumped[obj->number]) - VG_(fprintf)(fp, "%s(%u)\n", prefix, obj->number); - else { - VG_(fprintf)(fp, "%s(%u) %s\n", prefix, obj->number, obj->name); - } - } - else - VG_(fprintf)(fp, "%s%s\n", prefix, obj->name); - -#if 0 - /* add mapping parameters the first time a object is dumped - * format: mp=0xSTART SIZE 0xOFFSET */ - if (!obj_dumped[obj->number]) { - obj_dumped[obj->number]; - VG_(fprintf)(fp, "mp=%p %p %p\n", - pos->obj->start, pos->obj->size, pos->obj->offset); - } -#else - obj_dumped[obj->number] = True; -#endif -} - -static void print_file(VgFile *fp, const char *prefix, const file_node* file) -{ - if (TG_(clo).compress_strings) { - TG_ASSERT(file_dumped != 0); - if (file_dumped[file->number]) - VG_(fprintf)(fp, "%s(%u)\n", prefix, file->number); - else { - VG_(fprintf)(fp, "%s(%u) %s\n", prefix, file->number, file->name); - file_dumped[file->number] = True; - } - } - else - VG_(fprintf)(fp, "%s%s\n", prefix, file->name); -} - -/* - * tag can be "fn", "cfn", "jfn" - */ -static void print_fn(VgFile *fp, const HChar* tag, const fn_node* fn) -{ - VG_(fprintf)(fp, "%s=",tag); - if (TG_(clo).compress_strings) { - TG_ASSERT(fn_dumped != 0); - if (fn_dumped[fn->number]) - VG_(fprintf)(fp, "(%u)\n", fn->number); - else { - VG_(fprintf)(fp, "(%u) %s\n", fn->number, fn->name); - fn_dumped[fn->number] = True; - } - } - else - VG_(fprintf)(fp, "%s\n", fn->name); -} - -static void print_mangled_fn(VgFile *fp, const HChar* tag, - Context* cxt, int rec_index) -{ - int i; - - if (TG_(clo).compress_strings && TG_(clo).compress_mangled) { - - int n; - Context* last; - - TG_ASSERT(cxt_dumped != 0); - if (cxt_dumped[cxt->base_number+rec_index]) { - VG_(fprintf)(fp, "%s=(%u)\n", - tag, cxt->base_number + rec_index); - return; - } - - last = 0; - /* make sure that for all context parts compressed data is written */ - for(i=cxt->size;i>0;i--) { - TG_ASSERT(cxt->fn[i-1]->pure_cxt != 0); 
- n = cxt->fn[i-1]->pure_cxt->base_number; - if (cxt_dumped[n]) continue; - VG_(fprintf)(fp, "%s=(%d) %s\n", - tag, n, cxt->fn[i-1]->name); - - cxt_dumped[n] = True; - last = cxt->fn[i-1]->pure_cxt; - } - /* If the last context was the context to print, we are finished */ - if ((last == cxt) && (rec_index == 0)) return; - - VG_(fprintf)(fp, "%s=(%u) (%u)", tag, - cxt->base_number + rec_index, - cxt->fn[0]->pure_cxt->base_number); - if (rec_index >0) - VG_(fprintf)(fp, "'%d", rec_index +1); - for(i=1;isize;i++) - VG_(fprintf)(fp, "'(%u)", - cxt->fn[i]->pure_cxt->base_number); - VG_(fprintf)(fp, "\n"); - - cxt_dumped[cxt->base_number+rec_index] = True; - return; - } - - - VG_(fprintf)(fp, "%s=", tag); - if (TG_(clo).compress_strings) { - TG_ASSERT(cxt_dumped != 0); - if (cxt_dumped[cxt->base_number+rec_index]) { - VG_(fprintf)(fp, "(%u)\n", cxt->base_number + rec_index); - return; - } - else { - VG_(fprintf)(fp, "(%u) ", cxt->base_number + rec_index); - cxt_dumped[cxt->base_number+rec_index] = True; - } - } - - VG_(fprintf)(fp, "%s", cxt->fn[0]->name); - if (rec_index >0) - VG_(fprintf)(fp, "'%d", rec_index +1); - for(i=1;isize;i++) - VG_(fprintf)(fp, "'%s", cxt->fn[i]->name); - - VG_(fprintf)(fp, "\n"); -} - - - -/** - * Print function position of the BBCC, but only print info differing to - * the position, update - * Return True if something changes. - */ -static Bool print_fn_pos(VgFile *fp, FnPos* last, BBCC* bbcc) -{ - Bool res = False; - - TG_ASSERT(bbcc && bbcc->cxt); - - TG_DEBUGIF(3) { - TG_DEBUG(2, "+ print_fn_pos: "); - TG_(print_cxt)(16, bbcc->cxt, bbcc->rec_index); - } - - if (!TG_(clo).mangle_names) { - if (last->rec_index != bbcc->rec_index) { - VG_(fprintf)(fp, "rec=%u\n\n", bbcc->rec_index); - last->rec_index = bbcc->rec_index; - last->cxt = 0; /* reprint context */ - res = True; - } - - if (last->cxt != bbcc->cxt) { - fn_node* last_from = (last->cxt && last->cxt->size >1) ? - last->cxt->fn[1] : 0; - fn_node* curr_from = (bbcc->cxt->size >1) ? - bbcc->cxt->fn[1] : 0; - if (curr_from == 0) { - if (last_from != 0) { - /* switch back to no context */ - VG_(fprintf)(fp, "frfn=(spontaneous)\n"); - res = True; - } - } - else if (last_from != curr_from) { - print_fn(fp, "frfn", curr_from); - res = True; - } - last->cxt = bbcc->cxt; - } - } - - if (last->obj != bbcc->cxt->fn[0]->file->obj) { - print_obj(fp, "ob=", bbcc->cxt->fn[0]->file->obj); - last->obj = bbcc->cxt->fn[0]->file->obj; - res = True; - } - - if (last->file != bbcc->cxt->fn[0]->file) { - print_file(fp, "fl=", bbcc->cxt->fn[0]->file); - last->file = bbcc->cxt->fn[0]->file; - res = True; - } - - if (!TG_(clo).mangle_names) { - if (last->fn != bbcc->cxt->fn[0]) { - print_fn(fp, "fn", bbcc->cxt->fn[0]); - last->fn = bbcc->cxt->fn[0]; - res = True; - } - } - else { - /* Print mangled name if context or rec_index changes */ - if ((last->rec_index != bbcc->rec_index) || - (last->cxt != bbcc->cxt)) { - - print_mangled_fn(fp, "fn", bbcc->cxt, bbcc->rec_index); - last->fn = bbcc->cxt->fn[0]; - last->rec_index = bbcc->rec_index; - res = True; - } - } - - last->cxt = bbcc->cxt; - - TG_DEBUG(2, "- print_fn_pos: %s\n", res ? "changed" : ""); - - return res; -} - -/* the debug lookup cache is useful if BBCC for same BB are - * dumped directly in a row. This is a direct mapped cache. 
- */ -#define DEBUG_CACHE_SIZE 1777 - -static Addr debug_cache_addr[DEBUG_CACHE_SIZE]; -static file_node* debug_cache_file[DEBUG_CACHE_SIZE]; -static int debug_cache_line[DEBUG_CACHE_SIZE]; -static Bool debug_cache_info[DEBUG_CACHE_SIZE]; -static const HChar* debug_cache_inlfn[DEBUG_CACHE_SIZE]; - -static __inline__ -void init_debug_cache(void) -{ - int i; - for(i=0;iline = debug_cache_line[cachepos]; - p->file = debug_cache_file[cachepos]; - found_file_line = debug_cache_info[cachepos]; - } - else { - DiEpoch ep = VG_(current_DiEpoch)(); - found_file_line = VG_(get_filename_linenum)(ep, addr, - &file, - &dir, - &(p->line)); - if (!found_file_line) { - file = "???"; - p->line = 0; - } - p->file = TG_(get_file_node)(bbcc->bb->obj, dir, file); - - debug_cache_info[cachepos] = found_file_line; - debug_cache_addr[cachepos] = addr; - debug_cache_line[cachepos] = p->line; - debug_cache_file[cachepos] = p->file; - - /* Query inline info at the same time we query file/line */ - const HChar* inl_fn = 0; - Bool has_inline = VG_(get_inline_fnname)(ep, addr, &inl_fn); - if (has_inline) { - debug_cache_inlfn[cachepos] = inl_fn; - } else { - debug_cache_inlfn[cachepos] = (const HChar*)(-1); - } - } - - /* Address offset from bbcc start address */ - p->addr = addr - bbcc->bb->obj->offset; - p->bb_addr = bbcc->bb->offset; - - TG_DEBUG(3, " get_debug_pos(%#lx): BB %#lx, fn '%s', file '%s', line %u\n", - addr, bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name, - p->file->name, p->line); - - return found_file_line; -} - -/* Get inline function name for an address, with caching. - * Returns True if address is in an inlined function, False otherwise. - * If True, *inl_fn will be set to the inline function name. - */ -static Bool get_inline_info(Addr addr, const HChar** inl_fn) -{ - int cachepos = addr % DEBUG_CACHE_SIZE; - - /* Check cache first - but only if inline info was already queried for this address */ - if (debug_cache_addr[cachepos] == addr && debug_cache_inlfn[cachepos] != 0) { - /* We have cached inline info for this address */ - if (debug_cache_inlfn[cachepos] == (const HChar*)(-1)) { - /* Special marker: no inline function at this address */ - *inl_fn = 0; - return False; - } - *inl_fn = debug_cache_inlfn[cachepos]; - return True; - } - - DiEpoch ep = VG_(current_DiEpoch)(); - Bool has_inline = VG_(get_inline_fnname)(ep, addr, inl_fn); - - if (has_inline) { - /* Cache the inline function name */ - debug_cache_inlfn[cachepos] = *inl_fn; - } else { - *inl_fn = 0; - /* Use special marker -1 to indicate "no inline function" */ - debug_cache_inlfn[cachepos] = (const HChar*)(-1); - } - - TG_DEBUG(3, " get_inline_info(%#lx): %s\n", - addr, has_inline ? 
*inl_fn : "(not inlined)"); - - return has_inline; -} - +/* ================================================================== */ +/* === CSV trace output === */ +/* ================================================================== */ -/* copy file position and init cost */ -static void init_apos(AddrPos* p, Addr addr, Addr bbaddr, file_node* file) -{ - p->addr = addr; - p->bb_addr = bbaddr; - p->file = file; - p->line = 0; -} +trace_output TG_(trace_out) = { .fd = -1, .seq = 0, + .initialized = False, + .header_written = False }; -static void copy_apos(AddrPos* dst, AddrPos* src) -{ - dst->addr = src->addr; - dst->bb_addr = src->bb_addr; - dst->file = src->file; - dst->line = src->line; -} - -/* copy file position and init cost */ -static void init_fcost(AddrCost* c, Addr addr, Addr bbaddr, file_node* file) +/* Write a string to the trace output fd */ +static void trace_write(const HChar* buf, Int len) { - init_apos( &(c->p), addr, bbaddr, file); - /* FIXME: This is a memory leak as a AddrCost is inited multiple times */ - c->cost = TG_(get_eventset_cost)( TG_(sets).full ); - TG_(init_cost)( TG_(sets).full, c->cost ); + if (TG_(trace_out).fd < 0) return; + VG_(write)(TG_(trace_out).fd, buf, len); } -/* Track last inline function to avoid repeated cfni= output */ -static const HChar* last_inline_fn = 0; - -/** - * print position change inside of a BB (last -> curr) - * this doesn't update last to curr! +/* Escape a string for CSV: if it contains comma, quote, or newline, + * wrap in quotes and double any quotes. Otherwise just copy. + * Writes to buf, returns chars written. buf must be large enough. */ -static void fprint_apos(VgFile *fp, AddrPos* curr, AddrPos* last, - file_node* func_file, BBCC* bbcc) +static Int csv_escape(HChar* buf, Int bufsize, const HChar* src) { - TG_ASSERT(curr->file != 0); - TG_DEBUG(2, " print_apos(file '%s', line %u, bb %#lx, addr %#lx) fnFile '%s'\n", - curr->file->name, curr->line, curr->bb_addr, curr->addr, - func_file->name); - - if (curr->file != last->file) { - - /* if we switch back to orig file, use fe=... */ - if (curr->file == func_file) - print_file(fp, "fe=", curr->file); - else - print_file(fp, "fi=", curr->file); - } + Bool needs_quote = False; + const HChar* p; + Int i; - /* Check inline function for this position and output cfni= if changed */ - if (bbcc) { - Addr curr_addr = curr->addr + bbcc->bb->obj->offset; - const HChar* inline_fn = 0; - Bool is_inline = get_inline_info(curr_addr, &inline_fn); - - /* Output cfni= if inline function changed */ - if (is_inline && inline_fn && inline_fn != last_inline_fn) { - VG_(fprintf)(fp, "cfni=%s\n", inline_fn); - last_inline_fn = inline_fn; - } - /* Clear last_inline_fn if we're no longer in inline code */ - else if (!is_inline && last_inline_fn) { - VG_(fprintf)(fp, "cfni=???\n"); - last_inline_fn = 0; + for (p = src; *p; p++) { + if (*p == ',' || *p == '"' || *p == '\n') { + needs_quote = True; + break; } } - if (TG_(clo).dump_bbs) { - if (curr->line != last->line) { - VG_(fprintf)(fp, "ln=%u\n", curr->line); - } + if (!needs_quote) { + i = 0; + for (p = src; *p && i < bufsize - 1; p++, i++) + buf[i] = *p; + buf[i] = '\0'; + return i; } -} - - - -/** - * Print a position. - * This prints out differences if allowed - * - * This doesn't set last to curr afterwards! 
- */ -static -void fprint_pos(VgFile *fp, const AddrPos* curr, const AddrPos* last) -{ - if (0) //TG_(clo).dump_bbs) - VG_(fprintf)(fp, "%lu ", curr->addr - curr->bb_addr); - else { - if (TG_(clo).dump_instr) { - int diff = curr->addr - last->addr; - if ( TG_(clo).compress_pos && (last->addr >0) && - (diff > -100) && (diff < 100)) { - if (diff >0) - VG_(fprintf)(fp, "+%d ", diff); - else if (diff==0) - VG_(fprintf)(fp, "* "); - else - VG_(fprintf)(fp, "%d ", diff); - } - else - VG_(fprintf)(fp, "%#lx ", curr->addr); - } - - if (TG_(clo).dump_bb) { - int diff = curr->bb_addr - last->bb_addr; - if ( TG_(clo).compress_pos && (last->bb_addr >0) && - (diff > -100) && (diff < 100)) { - if (diff >0) - VG_(fprintf)(fp, "+%d ", diff); - else if (diff==0) - VG_(fprintf)(fp, "* "); - else - VG_(fprintf)(fp, "%d ", diff); - } - else - VG_(fprintf)(fp, "%#lx ", curr->bb_addr); - } - - if (TG_(clo).dump_line) { - int diff = curr->line - last->line; - if ( TG_(clo).compress_pos && (last->line >0) && - (diff > -100) && (diff < 100)) { - - if (diff >0) - VG_(fprintf)(fp, "+%d ", diff); - else if (diff==0) - VG_(fprintf)(fp, "* "); - else - VG_(fprintf)(fp, "%d ", diff); - } - else - VG_(fprintf)(fp, "%u ", curr->line); - } - } -} - - -/** - * Print events. - */ - -static -void fprint_cost(VgFile *fp, const EventMapping* es, const ULong* cost) -{ - HChar *mcost = TG_(mappingcost_as_string)(es, cost); - VG_(fprintf)(fp, "%s\n", mcost); - TG_FREE(mcost); -} - - - -/* Write the cost of a source line; only that parts of the source - * position are written that changed relative to last written position. - * funcPos is the source position of the first line of actual function. - * Something is written only if cost != 0; returns True in this case. - */ -static void fprint_fcost(VgFile *fp, AddrCost* c, AddrPos* last) -{ - TG_DEBUGIF(3) { - TG_DEBUG(2, " print_fcost(file '%s', line %u, bb %#lx, addr %#lx):\n", - c->p.file->name, c->p.line, c->p.bb_addr, c->p.addr); - TG_(print_cost)(-5, TG_(sets).full, c->cost); - } - - fprint_pos(fp, &(c->p), last); - copy_apos( last, &(c->p) ); /* update last to current position */ - - fprint_cost(fp, TG_(dumpmap), c->cost); - - /* add cost to total */ - TG_(add_and_zero_cost)( TG_(sets).full, dump_total_cost, c->cost ); -} - -/* Write out the calls from jcc (at pos) - */ -static void fprint_jcc(VgFile *fp, jCC* jcc, AddrPos* curr, AddrPos* last, - ULong ecounter) -{ - static AddrPos target; - file_node* file; - obj_node* obj; - - TG_DEBUGIF(2) { - TG_DEBUG(2, " fprint_jcc (jkind %d)\n", (Int)jcc->jmpkind); - TG_(print_jcc)(-10, jcc); - } - - TG_ASSERT(jcc->to !=0); - TG_ASSERT(jcc->from !=0); - - if (!get_debug_pos(jcc->to, bb_addr(jcc->to->bb), &target)) { - /* if we don't have debug info, don't switch to file "???" */ - target.file = last->file; - } - - if ((jcc->jmpkind == jk_CondJump) || (jcc->jmpkind == jk_Jump)) { - - /* this is a JCC for a followed conditional or boring jump. */ - TG_ASSERT(TG_(is_zero_cost)( TG_(sets).full, jcc->cost)); - - /* objects among jumps should be the same. - * Otherwise this jump would have been changed to a call - * (see setup_bbcc) - */ - TG_ASSERT(jcc->from->bb->obj == jcc->to->bb->obj); - - /* only print if target position info is useful */ - if (!TG_(clo).dump_instr && !TG_(clo).dump_bb && target.line==0) { - jcc->call_counter = 0; - return; - } - - /* Different files/functions are possible e.g. 
with longjmp's - * which change the stack, and thus context - */ - if (last->file != target.file) { - print_file(fp, "jfi=", target.file); - } - - if (jcc->from->cxt != jcc->to->cxt) { - if (TG_(clo).mangle_names) - print_mangled_fn(fp, "jfn", - jcc->to->cxt, jcc->to->rec_index); - else - print_fn(fp, "jfn", jcc->to->cxt->fn[0]); - } - - if (jcc->jmpkind == jk_CondJump) { - /* format: jcnd=/ */ - VG_(fprintf)(fp, "jcnd=%llu/%llu ", - jcc->call_counter, ecounter); - } - else { - /* format: jump= */ - VG_(fprintf)(fp, "jump=%llu ", - jcc->call_counter); - } - - fprint_pos(fp, &target, last); - VG_(fprintf)(fp, "\n"); - fprint_pos(fp, curr, last); - VG_(fprintf)(fp, "\n"); - - jcc->call_counter = 0; - return; - } - - file = jcc->to->cxt->fn[0]->file; - obj = jcc->to->bb->obj; - - /* object of called position different to object of this function?*/ - if (jcc->from->cxt->fn[0]->file->obj != obj) { - print_obj(fp, "cob=", obj); - } - - /* file of called position different to current file? */ - if (last->file != file) { - print_file(fp, "cfi=", file); - } - - if (TG_(clo).mangle_names) - print_mangled_fn(fp, "cfn", jcc->to->cxt, jcc->to->rec_index); - else - print_fn(fp, "cfn", jcc->to->cxt->fn[0]); - - if (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost)) { - VG_(fprintf)(fp, "calls=%llu ", - jcc->call_counter); - - fprint_pos(fp, &target, last); - VG_(fprintf)(fp, "\n"); - fprint_pos(fp, curr, last); - fprint_cost(fp, TG_(dumpmap), jcc->cost); - - TG_(init_cost)( TG_(sets).full, jcc->cost ); - - jcc->call_counter = 0; - } -} - - - -/* Cost summation of functions.We use alternately ccSum[0/1], thus - * ssSum[currSum] for recently read lines with same line number. - */ -static AddrCost ccSum[2]; -static int currSum; - -/* Merge two sorted jCC lists. - * Assumes both input lists are sorted by creation_seq. - * Returns a new merged list that is also sorted by creation_seq. - */ -static jCC* merge_jcc_lists(jCC* left, jCC* right) { - jCC dummy; - dummy.next_from = NULL; - jCC* tail = &dummy; - - while (left && right) { - if (left->creation_seq <= right->creation_seq) { - tail->next_from = left; - left = left->next_from; + i = 0; + if (i < bufsize - 1) buf[i++] = '"'; + for (p = src; *p && i < bufsize - 2; p++) { + if (*p == '"' && i < bufsize - 3) { + buf[i++] = '"'; + buf[i++] = '"'; } else { - tail->next_from = right; - right = right->next_from; + buf[i++] = *p; } - tail = tail->next_from; - } - - tail->next_from = left ? left : right; - return dummy.next_from; -} - -/* Merge sort for jCC lists to ensure chronological dump order. - * Sorts by creation_seq field to preserve execution order. 
- */ -static jCC* sort_jcc_list(jCC* head) { - if (!head || !head->next_from) return head; - - /* Split list into two halves using slow/fast pointer technique */ - jCC* slow = head; - jCC* fast = head->next_from; - - while (fast && fast->next_from) { - slow = slow->next_from; - fast = fast->next_from->next_from; } - - /* Split at midpoint */ - jCC* mid = slow->next_from; - slow->next_from = NULL; - - /* Recursively sort both halves and merge */ - return merge_jcc_lists(sort_jcc_list(head), sort_jcc_list(mid)); + if (i < bufsize - 1) buf[i++] = '"'; + buf[i] = '\0'; + return i; } -/* - * Print all costs of a BBCC: - * - FCCs of instructions - * - JCCs of the unique jump of this BB - * returns True if something was written - */ -static Bool fprint_bbcc(VgFile *fp, BBCC* bbcc, AddrPos* last) +void TG_(trace_open_output)(void) { - InstrInfo* instr_info; - ULong ecounter; - Bool something_written = False; - jCC* jcc; - AddrCost *currCost, *newCost; - Int jcc_count = 0, instr, i, jmp; - BB* bb = bbcc->bb; - - TG_ASSERT(bbcc->cxt != 0); - TG_DEBUGIF(1) { - VG_(printf)("+ fprint_bbcc (Instr %u): ", bb->instr_count); - TG_(print_bbcc)(15, bbcc); - } + SysRes res; + HChar filename[512]; - TG_ASSERT(currSum == 0 || currSum == 1); - currCost = &(ccSum[currSum]); - newCost = &(ccSum[1-currSum]); - - ecounter = bbcc->ecounter_sum; - jmp = 0; - instr_info = &(bb->instr[0]); - for(instr=0; instrinstr_count; instr++, instr_info++) { - - /* get debug info of current instruction address and dump cost - * if TG_(clo).dump_bbs or file/line has changed - */ - Addr instr_addr = bb_addr(bb) + instr_info->instr_offset; - if (!get_debug_pos(bbcc, instr_addr, &(newCost->p))) { - /* if we don't have debug info, don't switch to file "???" */ - newCost->p.file = bbcc->cxt->fn[0]->file; - } + if (TG_(trace_out).initialized) return; - if (TG_(clo).dump_bbs || TG_(clo).dump_instr || - (newCost->p.line != currCost->p.line) || - (newCost->p.file != currCost->p.file)) { - - if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { - something_written = True; - - /* Output file position and inline function markers */ - fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); - - fprint_fcost(fp, currCost, last); - } - - /* switch buffers */ - currSum = 1 - currSum; - currCost = &(ccSum[currSum]); - newCost = &(ccSum[1-currSum]); - } - - /* add line cost to current cost sum */ - (*TG_(cachesim).add_icost)(currCost->cost, bbcc, instr_info, ecounter); - - /* print jcc's if there are: only jumps */ - if (bb->jmp[jmp].instr == instr) { - jcc_count=0; - for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) - if (((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || - (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) - jcc_count++; - - if (jcc_count>0) { - if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { - /* no need to switch buffers, as position is the same */ - fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); - fprint_fcost(fp, currCost, last); - } - get_debug_pos(bbcc, bb_addr(bb)+instr_info->instr_offset, &(currCost->p)); - fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); - something_written = True; - - /* Sort jcc_list by creation sequence to ensure chronological order */ - bbcc->jmp[jmp].jcc_list = sort_jcc_list(bbcc->jmp[jmp].jcc_list); - for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { - if (((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || - (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) - fprint_jcc(fp, jcc, &(currCost->p), 
last, ecounter); - } - } - } - - /* update execution counter */ - if (jmp < bb->cjmp_count) - if (bb->jmp[jmp].instr == instr) { - ecounter -= bbcc->jmp[jmp].ecounter; - jmp++; - } - } - - /* jCCs at end? If yes, dump cumulated line info first */ - jcc_count = 0; - for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { - /* yes, if JCC only counts jmp arcs or cost >0 */ - if ( ((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || - (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) - jcc_count++; - } - - if ( (bbcc->skipped && - !TG_(is_zero_cost)(TG_(sets).full, bbcc->skipped)) || - (jcc_count>0) ) { - - if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { - /* no need to switch buffers, as position is the same */ - fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); - fprint_fcost(fp, currCost, last); - } - - get_debug_pos(bbcc, bb_jmpaddr(bb), &(currCost->p)); - fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); - something_written = True; - - /* first, print skipped costs for calls */ - if (bbcc->skipped && !TG_(is_zero_cost)( TG_(sets).full, - bbcc->skipped )) { - TG_(add_and_zero_cost)( TG_(sets).full, - currCost->cost, bbcc->skipped ); -#if 0 - VG_(fprintf)(fp, "# Skipped\n"); -#endif - fprint_fcost(fp, currCost, last); - } + if (!TG_(clo).out_format) + TG_(clo).out_format = DEFAULT_OUTFORMAT; - if (jcc_count > 0) { - /* Sort jcc_list by creation sequence to ensure chronological order */ - bbcc->jmp[jmp].jcc_list = sort_jcc_list(bbcc->jmp[jmp].jcc_list); - for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { - TG_ASSERT(jcc->jmp == jmp); - if ( ((jcc->jmpkind != jk_Call) && (jcc->call_counter >0)) || - (!TG_(is_zero_cost)( TG_(sets).full, jcc->cost ))) + HChar* expanded = VG_(expand_file_name)("--tracegrind-out-file", + TG_(clo).out_format); + VG_(strncpy)(filename, expanded, sizeof(filename) - 1); + filename[sizeof(filename) - 1] = '\0'; + VG_(free)(expanded); - fprint_jcc(fp, jcc, &(currCost->p), last, ecounter); - } + res = VG_(open)(filename, + VKI_O_CREAT|VKI_O_WRONLY|VKI_O_TRUNC, + VKI_S_IRUSR|VKI_S_IWUSR); + if (sr_isError(res)) { + VG_(message)(Vg_UserMsg, + "Error: cannot open trace output file '%s'\n", filename); + VG_(exit)(1); } - } - if (TG_(clo).dump_bbs || TG_(clo).dump_bb) { - if (!TG_(is_zero_cost)( TG_(sets).full, currCost->cost )) { - something_written = True; + TG_(trace_out).fd = (Int)sr_Res(res); + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = True; + TG_(trace_out).header_written = False; - fprint_apos(fp, &(currCost->p), last, bbcc->cxt->fn[0]->file, bbcc); - fprint_fcost(fp, currCost, last); - } - if (TG_(clo).dump_bbs) VG_(fprintf)(fp, "\n"); - - /* when every cost was immediately written, we must have done so, - * as this function is only called when there's cost in a BBCC - */ - TG_ASSERT(something_written); - } - - bbcc->ecounter_sum = 0; - for(i=0; i<=bbcc->bb->cjmp_count; i++) - bbcc->jmp[i].ecounter = 0; - bbcc->ret_counter = 0; - - TG_DEBUG(1, "- fprint_bbcc: JCCs %d\n", jcc_count); - - return something_written; + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); } -/* order by - * recursion, - * from->bb->obj, from->bb->fn - * obj, fn[0]->file, fn - * address +/* Write the CSV header row. + * Called lazily on first sample emission so that event sets are fully configured. 
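+ *
+ * A sketch of the resulting output (illustrative only: the event columns are
+ * whatever TG_(sets).full contains, e.g. just "Ir" when cache/branch
+ * simulation is off, and the fn/obj/file values below are made up):
+ *
+ *   seq,tid,event,fn,obj,file,line,Ir
+ *   1,1,ENTER,main,/usr/bin/prog,main.c,0,1532
+ *   2,1,EXIT,main,/usr/bin/prog,main.c,0,87
+ *
+ * Fields containing a comma, quote, or newline are escaped by csv_escape()
+ * before being written.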
*/ -static int my_cmp(BBCC** pbbcc1, BBCC** pbbcc2) +static void trace_write_header(void) { -#if 0 - return (*pbbcc1)->bb->offset - (*pbbcc2)->bb->offset; -#else - BBCC *bbcc1 = *pbbcc1; - BBCC *bbcc2 = *pbbcc2; - Context* cxt1 = bbcc1->cxt; - Context* cxt2 = bbcc2->cxt; - int off = 1; - - if (cxt1->fn[0]->file->obj != cxt2->fn[0]->file->obj) - return cxt1->fn[0]->file->obj - cxt2->fn[0]->file->obj; - - if (cxt1->fn[0]->file != cxt2->fn[0]->file) - return cxt1->fn[0]->file - cxt2->fn[0]->file; - - if (cxt1->fn[0] != cxt2->fn[0]) - return cxt1->fn[0] - cxt2->fn[0]; - - if (bbcc1->rec_index != bbcc2->rec_index) - return bbcc1->rec_index - bbcc2->rec_index; - - while((off < cxt1->size) && (off < cxt2->size)) { - fn_node* ffn1 = cxt1->fn[off]; - fn_node* ffn2 = cxt2->fn[off]; - if (ffn1->file->obj != ffn2->file->obj) - return ffn1->file->obj - ffn2->file->obj; - if (ffn1 != ffn2) - return ffn1 - ffn2; - off++; - } - if (cxt1->size > cxt2->size) return 1; - else if (cxt1->size < cxt2->size) return -1; + HChar buf[4096]; + Int pos = 0; - return bbcc1->bb->offset - bbcc2->bb->offset; -#endif -} + if (TG_(trace_out).header_written) return; + TG_(trace_out).header_written = True; + pos += VG_(sprintf)(buf + pos, "seq,tid,event,fn,obj,file,line"); - - - -/* modified version of: - * - * qsort -- qsort interface implemented by faster quicksort. - * J. L. Bentley and M. D. McIlroy, SPE 23 (1993) 1249-1265. - * Copyright 1993, John Wiley. -*/ - -static __inline__ -void swap(BBCC** a, BBCC** b) -{ - BBCC* t; - t = *a; *a = *b; *b = t; -} - -#if !defined(min) -#define min(x, y) ((x)<=(y) ? (x) : (y)) -#endif - -static -BBCC** med3(BBCC **a, BBCC **b, BBCC **c, int (*cmp)(BBCC**,BBCC**)) -{ return cmp(a, b) < 0 ? - (cmp(b, c) < 0 ? b : cmp(a, c) < 0 ? c : a) - : (cmp(b, c) > 0 ? b : cmp(a, c) > 0 ? 
c : a); -} - -static BBCC** qsort_start = 0; - -static void TG_(qsort)(BBCC **a, int n, int (*cmp)(BBCC**,BBCC**)) -{ - BBCC **pa, **pb, **pc, **pd, **pl, **pm, **pn, **pv; - int s, r; - BBCC* v; - - TG_DEBUG(8, " qsort(%ld,%ld)\n", a-qsort_start + 0L, n + 0L); - - if (n < 7) { /* Insertion sort on smallest arrays */ - for (pm = a+1; pm < a+n; pm++) - for (pl = pm; pl > a && cmp(pl-1, pl) > 0; pl --) - swap(pl, pl-1); - - TG_DEBUGIF(8) { - for (pm = a; pm < a+n; pm++) { - VG_(printf)(" %3ld BB %#lx, ", - pm - qsort_start + 0L, - bb_addr((*pm)->bb)); - TG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index); - } - } - return; - } - pm = a + n/2; /* Small arrays, middle element */ - if (n > 7) { - pl = a; - pn = a + (n-1); - if (n > 40) { /* Big arrays, pseudomedian of 9 */ - s = n/8; - pl = med3(pl, pl+s, pl+2*s, cmp); - pm = med3(pm-s, pm, pm+s, cmp); - pn = med3(pn-2*s, pn-s, pn, cmp); - } - pm = med3(pl, pm, pn, cmp); /* Mid-size, med of 3 */ - } - - - v = *pm; - pv = &v; - pa = pb = a; - pc = pd = a + (n-1); - for (;;) { - while ((pb <= pc) && ((r=cmp(pb, pv)) <= 0)) { - if (r==0) { - /* same as pivot, to start */ - swap(pa,pb); pa++; - } - pb ++; - } - while ((pb <= pc) && ((r=cmp(pc, pv)) >= 0)) { - if (r==0) { - /* same as pivot, to end */ - swap(pc,pd); pd--; - } - pc --; - } - if (pb > pc) { break; } - swap(pb, pc); - pb ++; - pc --; - } - pb--; - pc++; - - /* put pivot from start into middle */ - if ((s = pa-a)>0) { for(r=0;r0) { for(r=0;rbb)); - TG_(print_cxt)(9, (*pv)->cxt, (*pv)->rec_index); - - s = pb-pa+1; - VG_(printf)(" Lower %ld - %ld:\n", - a-qsort_start + 0L, - a+s-1-qsort_start + 0L); - for (r=0;rbb)); - TG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index); - } - - s = pd-pc+1; - VG_(printf)(" Upper %ld - %ld:\n", - a+n-s-qsort_start + 0L, - a+n-1-qsort_start + 0L); - for (r=0;rbb)); - TG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index); - } - } - - if ((s = pb+1-pa) > 1) TG_(qsort)(a, s, cmp); - if ((s = pd+1-pc) > 1) TG_(qsort)(a+n-s, s, cmp); -} - - -/* Helpers for prepare_dump */ - -static Int prepare_count; -static BBCC** prepare_ptr; - - -static void hash_addCount(BBCC* bbcc) -{ - if ((bbcc->ecounter_sum > 0) || (bbcc->ret_counter>0)) - prepare_count++; -} - -static void hash_addPtr(BBCC* bbcc) -{ - if ((bbcc->ecounter_sum == 0) && - (bbcc->ret_counter == 0)) return; - - *prepare_ptr = bbcc; - prepare_ptr++; -} - - -static void cs_addCount(thread_info* ti) -{ - Int i; - BBCC* bbcc; - - /* add BBCCs with active call in call stack of current thread. - * update cost sums for active calls - */ - - for(i = 0; i < TG_(current_call_stack).sp; i++) { - call_entry* e = &(TG_(current_call_stack).entry[i]); - if (e->jcc == 0) continue; - - TG_(add_diff_cost_lz)( TG_(sets).full, &(e->jcc->cost), - e->enter_cost, TG_(current_state).cost); - bbcc = e->jcc->from; - - TG_DEBUG(1, " [%2d] (tid %u), added active: %s\n", - i,TG_(current_tid),bbcc->cxt->fn[0]->name); - - if (bbcc->ecounter_sum>0 || bbcc->ret_counter>0) { - /* already counted */ - continue; - } - prepare_count++; - } -} - -static void cs_addPtr(thread_info* ti) -{ - Int i; - BBCC* bbcc; - - /* add BBCCs with active call in call stack of current thread. 
- * update cost sums for active calls - */ - - for(i = 0; i < TG_(current_call_stack).sp; i++) { - call_entry* e = &(TG_(current_call_stack).entry[i]); - if (e->jcc == 0) continue; - - bbcc = e->jcc->from; - - if (bbcc->ecounter_sum>0 || bbcc->ret_counter>0) { - /* already counted */ - continue; + /* Emit column names for all events in the full event set */ + EventSet* es = TG_(sets).full; + Int g, i; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) continue; + for (i = 0; i < eg->size; i++) { + pos += VG_(sprintf)(buf + pos, ",%s", eg->name[i]); + } } - *prepare_ptr = bbcc; - prepare_ptr++; - } -} - - -/** - * Put all BBCCs with costs into a sorted array. - * The returned arrays ends with a null pointer. - * Must be freed after dumping. - */ -static -BBCC** prepare_dump(void) -{ - BBCC **array; - - prepare_count = 0; - - /* if we do not separate among threads, this gives all */ - /* count number of BBCCs with >0 executions */ - TG_(forall_bbccs)(hash_addCount); - - /* even if we do not separate among threads, - * call stacks are separated */ - if (TG_(clo).separate_threads) - cs_addCount(0); - else - TG_(forall_threads)(cs_addCount); - - TG_DEBUG(0, "prepare_dump: %d BBCCs\n", prepare_count); - - /* allocate bbcc array, insert BBCCs and sort */ - prepare_ptr = array = - (BBCC**) TG_MALLOC("cl.dump.pd.1", - (prepare_count+1) * sizeof(BBCC*)); - - TG_(forall_bbccs)(hash_addPtr); - - if (TG_(clo).separate_threads) - cs_addPtr(0); - else - TG_(forall_threads)(cs_addPtr); - - TG_ASSERT(array + prepare_count == prepare_ptr); - - /* end mark */ - *prepare_ptr = 0; - - TG_DEBUG(0," BBCCs inserted\n"); - - qsort_start = array; - TG_(qsort)(array, prepare_count, my_cmp); - - TG_DEBUG(0," BBCCs sorted\n"); - - return array; -} - - - - -static void fprint_cost_ln(VgFile *fp, const HChar* prefix, - const EventMapping* em, const ULong* cost) -{ - HChar *mcost = TG_(mappingcost_as_string)(em, cost); - VG_(fprintf)(fp, "%s%s\n", prefix, mcost); - TG_FREE(mcost); -} - -static ULong bbs_done = 0; -static HChar* filename = 0; - -static -void file_err(void) -{ - VG_(message)(Vg_UserMsg, - "Error: can not open cache simulation output file `%s'\n", - filename ); - VG_(exit)(1); + pos += VG_(sprintf)(buf + pos, "\n"); + trace_write(buf, pos); } -/** - * Create a new dump file and write header. 
- * - * Naming: .[.][-] - * is skipped for final dump (trigger==0) - * is skipped for thread 1 with TG_(clo).separate_threads=no - * - * Returns the file descriptor, and -1 on error (no write permission) - */ -static VgFile *new_dumpfile(int tid, const HChar* trigger) +void TG_(trace_emit_sample)(ThreadId tid, const HChar* event_type, + fn_node* fn) { - Bool appending = False; - int i; - FullCost sum = 0; - VgFile *fp; + HChar buf[4096]; + HChar escaped[1024]; + Int pos = 0; + Int i; - TG_ASSERT(dumps_initialized); - TG_ASSERT(filename != 0); + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; - if (!TG_(clo).combine_dumps) { - i = VG_(sprintf)(filename, "%s", out_file); - - if (trigger) - i += VG_(sprintf)(filename+i, ".%d", out_counter); + /* Lazily write header on first sample */ + if (!TG_(trace_out).header_written) + trace_write_header(); - if (TG_(clo).separate_threads) - VG_(sprintf)(filename+i, "-%02d", tid); + /* Get current thread info for per-thread last_sample_cost */ + thread_info* ti = TG_(get_current_thread)(); + if (!ti) return; - fp = VG_(fopen)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0); - } - else { - VG_(sprintf)(filename, "%s", out_file); - fp = VG_(fopen)(filename, VKI_O_WRONLY|VKI_O_APPEND, 0); - if (fp && out_counter>1) - appending = True; - } + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; - if (fp == NULL) { - fp = VG_(fopen)(filename, VKI_O_CREAT|VKI_O_WRONLY, - VKI_S_IRUSR|VKI_S_IWUSR); - if (fp == NULL) { - /* If the file can not be opened for whatever reason (conflict - between multiple supervised processes?), give up now. */ - file_err(); - } + /* If last_sample_cost not yet allocated, allocate and zero it */ + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); } - TG_DEBUG(2, " new_dumpfile '%s'\n", filename); - - if (!appending) - reset_dump_array(); - - - if (!appending) { - /* callgrind format specification, has to be on 1st line */ - VG_(fprintf)(fp, "# callgrind format\n"); - - /* version */ - VG_(fprintf)(fp, "version: 1\n"); - - /* creator */ - VG_(fprintf)(fp, "creator: tracegrind-" VERSION "\n"); + TG_(trace_out).seq++; - /* "pid:" line */ - VG_(fprintf)(fp, "pid: %d\n", VG_(getpid)()); + /* Resolve function info */ + const HChar* fn_name = fn ? fn->name : "???"; + const HChar* obj_name = (fn && fn->file && fn->file->obj) + ? fn->file->obj->name : "???"; + const HChar* file_name = (fn && fn->file) ? fn->file->name : "???"; + UInt line = (fn && fn->file) ? 0 : 0; /* line from fn_node's BB */ - /* "cmd:" line */ - VG_(fprintf)(fp, "cmd: %s", cmdbuf); + /* Try to get line number from the function's BB debug info */ + if (fn && fn->pure_cxt) { + /* We could look up debug info here, but fn_node doesn't store line. + * The BB that was the entry point does store it. We use 0 as default. */ } - VG_(fprintf)(fp, "\npart: %d\n", out_counter); - if (TG_(clo).separate_threads) { - VG_(fprintf)(fp, "thread: %d\n", tid); - } - - /* "desc:" lines */ - if (!appending) { - VG_(fprintf)(fp, "\n"); - -#if 0 - /* Global options changing the tracing behaviour */ - VG_(fprintf)(fp, "\ndesc: Option: --skip-plt=%s\n", - TG_(clo).skip_plt ? "yes" : "no"); - VG_(fprintf)(fp, "desc: Option: --collect-jumps=%s\n", - TG_(clo).collect_jumps ? 
"yes" : "no"); - VG_(fprintf)(fp, "desc: Option: --separate-recs=%d\n", - TG_(clo).separate_recursions); - VG_(fprintf)(fp, "desc: Option: --separate-callers=%d\n", - TG_(clo).separate_callers); - - VG_(fprintf)(fp, "desc: Option: --dump-bbs=%s\n", - TG_(clo).dump_bbs ? "yes" : "no"); - VG_(fprintf)(fp, "desc: Option: --separate-threads=%s\n", - TG_(clo).separate_threads ? "yes" : "no"); -#endif - - (*TG_(cachesim).dump_desc)(fp); - } - - VG_(fprintf)(fp, "\ndesc: Timerange: Basic block %llu - %llu\n", - bbs_done, TG_(stat).bb_executions); - - VG_(fprintf)(fp, "desc: Trigger: %s\n", - trigger ? trigger : "Program termination"); - -#if 0 - /* Output function specific config - * FIXME */ - for (i = 0; i < N_FNCONFIG_ENTRIES; i++) { - fnc = fnc_table[i]; - while (fnc) { - if (fnc->skip) { - VG_(fprintf)(fp, "desc: Option: --fn-skip=%s\n", fnc->name); - } - if (fnc->dump_at_enter) { - VG_(fprintf)(fp, "desc: Option: --fn-dump-at-enter=%s\n", - fnc->name); - } - if (fnc->dump_at_leave) { - VG_(fprintf)(fp, "desc: Option: --fn-dump-at-leave=%s\n", - fnc->name); - } - if (fnc->separate_callers != TG_(clo).separate_callers) { - VG_(fprintf)(fp, "desc: Option: --separate-callers%d=%s\n", - fnc->separate_callers, fnc->name); - } - if (fnc->separate_recursions != TG_(clo).separate_recursions) { - VG_(fprintf)(fp, "desc: Option: --separate-recs%d=%s\n", - fnc->separate_recursions, fnc->name); - } - fnc = fnc->next; - } - } -#endif - - /* "positions:" line */ - VG_(fprintf)(fp, "\npositions:%s%s%s\n", - TG_(clo).dump_instr ? " instr" : "", - TG_(clo).dump_bb ? " bb" : "", - TG_(clo).dump_line ? " line" : ""); - - /* Some (optional) "event:" lines, giving long names to events. */ - switch (TG_(clo).collect_systime) { - case systime_no: break; - case systime_msec: - VG_(fprintf)(fp, "event: sysTime : sysTime (elapsed ms)\n"); - break; - case systime_usec: - VG_(fprintf)(fp, "event: sysTime : sysTime (elapsed us)\n"); - break; - case systime_nsec: - VG_(fprintf)(fp, "event: sysTime : sysTime (elapsed ns)\n"); - VG_(fprintf)(fp, "event: sysCpuTime : sysCpuTime (system cpu ns)\n"); - break; - default: - tl_assert(0); - } - - /* "events:" line - Note: callgrind_annotate expects the "events:" line to be the last line - of the PartData. In other words, this line is before the first line - of the PartData body. */ - HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); - VG_(fprintf)(fp, "events: %s\n", evmap); - VG_(free)(evmap); - - /* summary lines */ - sum = TG_(get_eventset_cost)( TG_(sets).full ); - TG_(zero_cost)(TG_(sets).full, sum); - if (TG_(clo).separate_threads) { - thread_info* ti = TG_(get_current_thread)(); - TG_(add_diff_cost)(TG_(sets).full, sum, ti->lastdump_cost, - ti->states.entry[0]->cost); - } - else { - /* This function is called once for thread 1, where - * all costs are summed up when not dumping separate per thread. - * But this is not true for summary: we need to add all threads. 
- */ - int t; - thread_info** thr = TG_(get_threads)(); - for(t=1;tlastdump_cost, - thr[t]->states.entry[0]->cost); - } - } - fprint_cost_ln(fp, "summary: ", TG_(dumpmap), sum); - - /* all dumped cost will be added to total_fcc */ - TG_(init_cost_lz)( TG_(sets).full, &dump_total_cost ); - - VG_(fprintf)(fp, "\n\n"); - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Dump to %s\n", filename); - - return fp; -} - - -static void close_dumpfile(VgFile *fp) -{ - if (fp == NULL) return; - - fprint_cost_ln(fp, "totals: ", TG_(dumpmap), - dump_total_cost); - //fprint_fcc_ln(fp, "summary: ", &dump_total_fcc); - TG_(add_cost_lz)(TG_(sets).full, - &TG_(total_cost), dump_total_cost); + /* seq, tid, event */ + pos += VG_(sprintf)(buf + pos, "%llu,%u,%s,", + TG_(trace_out).seq, + (UInt)tid, + event_type); - VG_(fclose)(fp); + /* fn (escaped) */ + csv_escape(escaped, sizeof(escaped), fn_name); + pos += VG_(sprintf)(buf + pos, "%s,", escaped); - if (filename[0] == '.') { - if (-1 == VG_(rename) (filename, filename+1)) { - /* Can not rename to correct file name: give out warning */ - VG_(message)(Vg_DebugMsg, "Warning: Can not rename .%s to %s\n", - filename, filename); - } - } -} - - -/* Helper for print_bbccs */ + /* obj (escaped) */ + csv_escape(escaped, sizeof(escaped), obj_name); + pos += VG_(sprintf)(buf + pos, "%s,", escaped); -static const HChar* print_trigger; + /* file (escaped) */ + csv_escape(escaped, sizeof(escaped), file_name); + pos += VG_(sprintf)(buf + pos, "%s,", escaped); -static void print_bbccs_of_thread(thread_info* ti) -{ - BBCC **p, **array; - FnPos lastFnPos; - AddrPos lastAPos; - - TG_DEBUG(1, "+ print_bbccs(tid %u)\n", TG_(current_tid)); + /* line */ + pos += VG_(sprintf)(buf + pos, "%u", line); - VgFile *print_fp = new_dumpfile(TG_(current_tid), print_trigger); - if (print_fp == NULL) { - TG_DEBUG(1, "- print_bbccs(tid %u): No output...\n", TG_(current_tid)); - return; - } - - p = array = prepare_dump(); - init_fpos(&lastFnPos); - init_apos(&lastAPos, 0, 0, 0); - - while(1) { - - /* on context/function change, print old cost buffer before */ - if (lastFnPos.cxt && ((*p==0) || - (lastFnPos.cxt != (*p)->cxt) || - (lastFnPos.rec_index != (*p)->rec_index))) { - if (!TG_(is_zero_cost)( TG_(sets).full, ccSum[currSum].cost )) { - /* no need to switch buffers, as position is the same */ - fprint_apos(print_fp, &(ccSum[currSum].p), &lastAPos, - lastFnPos.cxt->fn[0]->file, 0); - fprint_fcost(print_fp, &ccSum[currSum], &lastAPos); - } - - if (ccSum[currSum].p.file != lastFnPos.cxt->fn[0]->file) { - /* switch back to file of function */ - print_file(print_fp, "fe=", lastFnPos.cxt->fn[0]->file); - } - VG_(fprintf)(print_fp, "\n"); - } - - if (*p == 0) break; - - if (print_fn_pos(print_fp, &lastFnPos, *p)) { - - /* new function */ - init_apos(&lastAPos, 0, 0, (*p)->cxt->fn[0]->file); - init_fcost(&ccSum[0], 0, 0, 0); - init_fcost(&ccSum[1], 0, 0, 0); - currSum = 0; - last_inline_fn = 0; /* reset inline function tracking */ - } - - if (TG_(clo).dump_bbs) { - /* FIXME: Specify Object of BB if different to object of fn */ - int i; - ULong ecounter = (*p)->ecounter_sum; - VG_(fprintf)(print_fp, "bb=%#lx ", (UWord)(*p)->bb->offset); - for(i = 0; i<(*p)->bb->cjmp_count;i++) { - VG_(fprintf)(print_fp, "%u %llu ", - (*p)->bb->jmp[i].instr, - ecounter); - ecounter -= (*p)->jmp[i].ecounter; - } - VG_(fprintf)(print_fp, "%u %llu\n", - (*p)->bb->instr_count, - ecounter); + /* Compute and emit deltas for all event groups */ + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < 
es->size; i++) { + ULong delta = current_cost[i] - ti->last_sample_cost[i]; + pos += VG_(sprintf)(buf + pos, ",%llu", delta); + } + /* Update last_sample_cost snapshot */ + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + /* No cost data available, emit zeros */ + for (i = 0; i < es->size; i++) { + pos += VG_(sprintf)(buf + pos, ",0"); + } } - - fprint_bbcc(print_fp, *p, &lastAPos); - - p++; - } - close_dumpfile(print_fp); - VG_(free)(array); - - /* set counters of last dump */ - TG_(copy_cost)( TG_(sets).full, ti->lastdump_cost, - TG_(current_state).cost ); - - TG_DEBUG(1, "- print_bbccs(tid %u)\n", TG_(current_tid)); + pos += VG_(sprintf)(buf + pos, "\n"); + trace_write(buf, pos); } - -static void print_bbccs(const HChar* trigger, Bool only_current_thread) +void TG_(trace_close_output)(void) { - init_dump_array(); - init_debug_cache(); + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; - print_trigger = trigger; + /* Write a totals summary comment at the end for verification */ + if (TG_(total_cost)) { + HChar buf[4096]; + Int pos = 0; + Int i; + EventSet* es = TG_(sets).full; - if (!TG_(clo).separate_threads) { - /* All BBCC/JCC costs is stored for thread 1 */ - Int orig_tid = TG_(current_tid); + pos += VG_(sprintf)(buf + pos, "# totals:"); + for (i = 0; i < es->size; i++) { + pos += VG_(sprintf)(buf + pos, " %llu", TG_(total_cost)[i]); + } + pos += VG_(sprintf)(buf + pos, "\n"); + trace_write(buf, pos); + } - TG_(switch_thread)(1); - print_bbccs_of_thread( TG_(get_current_thread)() ); - TG_(switch_thread)(orig_tid); - } - else if (only_current_thread) - print_bbccs_of_thread( TG_(get_current_thread)() ); - else - TG_(forall_threads)(print_bbccs_of_thread); + VG_(close)(TG_(trace_out).fd); + TG_(trace_out).fd = -1; + TG_(trace_out).initialized = False; - free_dump_array(); + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Trace output closed (%llu samples written)\n", + TG_(trace_out).seq); } -void TG_(dump_profile)(const HChar* trigger, Bool only_current_thread) -{ - TG_DEBUG(2, "+ dump_profile(Trigger '%s')\n", - trigger ? trigger : "Prg.Term."); - - TG_(init_dumps)(); - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Start dumping at BB %llu (%s)...\n", - TG_(stat).bb_executions, - trigger ? trigger : "Prg.Term."); - - out_counter++; +/* ================================================================== */ +/* === Simplified dump (totals only, for verification) === */ +/* ================================================================== */ - print_bbccs(trigger, only_current_thread); - - bbs_done = TG_(stat).bb_executions++; - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Dumping done.\n"); -} +/* Command buffer for dump header */ +static HChar *cmdbuf; -/* Copy command to cmd buffer. We want to original command line - * (can change at runtime) - */ -static -void init_cmdbuf(void) +static void init_cmdbuf(void) { SizeT size; Int i,j; - /* Pass #1: How many bytes do we need? 
*/ - size = 1; // leading ' ' + size = 1; size += VG_(strlen)( VG_(args_the_exename) ); for (i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) { const HChar *arg = *(HChar**)VG_(indexXA)( VG_(args_for_client), i ); - size += 1; // separator ' ' - // escape NL in arguments to not break dump format + size += 1; for(j=0; arg[j]; j++) switch(arg[j]) { case '\n': case '\\': - size++; // fall through + size++; + /* fallthrough */ default: size++; } } - cmdbuf = TG_MALLOC("cl.dump.ic.1", size + 1); // +1 for '\0' + cmdbuf = TG_MALLOC("tg.dump.ic.1", size + 1); - /* Pass #2: Build up the string */ size = VG_(sprintf)(cmdbuf, " %s", VG_(args_the_exename)); for(i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) { @@ -1694,70 +333,68 @@ void init_cmdbuf(void) cmdbuf[size] = '\0'; } -/* - * Set up file names for dump output: . - * is derived from the output format string, which defaults - * to "tracegrind.out.%p", where %p is replaced with the PID. - * For the final file name, on intermediate dumps a counter is appended, - * and further, if separate dumps per thread are requested, the thread ID. - * - * always starts with a full absolute path. - * If the output format string represents a relative path, the current - * working directory at program start is used. - * - * This function has to be called every time a profile dump is generated - * to be able to react on PID changes. + +/* Dump profile now only computes totals (no callgraph output). + * The real output is the streaming CSV trace. */ -void TG_(init_dumps)(void) +void TG_(dump_profile)(const HChar* trigger, Bool only_current_thread) { - SysRes res; + TG_DEBUG(2, "+ dump_profile(Trigger '%s')\n", + trigger ? trigger : "Prg.Term."); + + TG_(init_dumps)(); + out_counter++; + + /* Compute totals from all threads */ + if (!TG_(total_cost)) { + TG_(total_cost) = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, TG_(total_cost)); + } + + /* Sum costs from all threads into total_cost */ + { + Int t; + thread_info** thr = TG_(get_threads)(); + for (t = 1; t < VG_N_THREADS; t++) { + if (!thr[t]) continue; + TG_(add_diff_cost)(TG_(sets).full, TG_(total_cost), + thr[t]->lastdump_cost, + thr[t]->states.entry[0]->cost); + /* Update lastdump_cost */ + TG_(copy_cost)(TG_(sets).full, thr[t]->lastdump_cost, + thr[t]->states.entry[0]->cost); + } + } + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Dump done (trigger: %s).\n", + trigger ? trigger : "Prg.Term."); +} + +void TG_(init_dumps)(void) +{ static int thisPID = 0; int currentPID = VG_(getpid)(); if (currentPID == thisPID) { - /* already initialized, and no PID change */ TG_ASSERT(out_file != 0); return; } thisPID = currentPID; - + if (!TG_(clo).out_format) TG_(clo).out_format = DEFAULT_OUTFORMAT; - /* If a file name was already set, clean up before */ if (out_file) { VG_(free)(out_file); - VG_(free)(filename); out_counter = 0; } - // Setup output filename. out_file = VG_(expand_file_name)("--tracegrind-out-file", TG_(clo).out_format); - /* allocate space big enough for final filenames */ - filename = (HChar*) TG_MALLOC("cl.dump.init_dumps.2", - VG_(strlen)(out_file)+32); - - /* Make sure the output base file can be written. - * This is used for the dump at program termination. - * We stop with an error here if we can not create the - * file: This is probably because of missing rights, - * and trace parts wouldn't be allowed to be written, too. 
- */ - VG_(strcpy)(filename, out_file); - res = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0); - if (sr_isError(res)) { - res = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY, - VKI_S_IRUSR|VKI_S_IWUSR); - if (sr_isError(res)) { - file_err(); - } - } - if (!sr_isError(res)) VG_(close)( (Int)sr_Res(res) ); - - if (!dumps_initialized) - init_cmdbuf(); + if (!dumps_initialized) + init_cmdbuf(); - dumps_initialized = True; + dumps_initialized = True; } diff --git a/tracegrind/global.h b/tracegrind/global.h index dd3659785..e2fbfcd9f 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -588,6 +588,9 @@ struct _thread_info { FullCost lastdump_cost; /* Cost at last dump */ FullCost sighandler_cost; + /* CSV trace: per-thread snapshot of cost at last sample emission */ + FullCost last_sample_cost; + /* thread specific data structure containers */ fn_array fn_active; jcc_hash jccs; @@ -676,6 +679,18 @@ struct event_sets { #define fullOffset(group) (TG_(sets).full->offset[group]) +/*------------------------------------------------------------*/ +/*--- CSV trace output state ---*/ +/*------------------------------------------------------------*/ + +typedef struct { + Int fd; /* Output file descriptor (-1 if not open) */ + ULong seq; /* Global sequence counter */ + Bool initialized; /* Has the output been opened? */ + Bool header_written; /* Has the CSV header been written? */ +} trace_output; + + /*------------------------------------------------------------*/ /*--- Functions ---*/ /*------------------------------------------------------------*/ @@ -783,6 +798,11 @@ void TG_(run_post_signal_on_call_stack_bottom)(void); /* from dump.c */ void TG_(init_dumps)(void); +/* CSV trace output (from dump.c) */ +void TG_(trace_open_output)(void); +void TG_(trace_emit_sample)(ThreadId tid, const HChar* event_type, fn_node* fn); +void TG_(trace_close_output)(void); + /*------------------------------------------------------------*/ /*--- Exported global variables ---*/ /*------------------------------------------------------------*/ @@ -790,6 +810,7 @@ void TG_(init_dumps)(void); extern CommandLineOptions TG_(clo); extern Statistics TG_(stat); extern EventMapping* TG_(dumpmap); +extern trace_output TG_(trace_out); /* Function active counter array, indexed by function number */ extern UInt* TG_(fn_active_array); diff --git a/tracegrind/main.c b/tracegrind/main.c index c6fff12f5..33c204b54 100644 --- a/tracegrind/main.c +++ b/tracegrind/main.c @@ -1947,6 +1947,9 @@ void finish(void) TG_(dump_profile)(0, False); + /* Close CSV trace output */ + TG_(trace_close_output)(); + if (VG_(clo_verbosity) == 0) return; if (VG_(clo_stats)) { @@ -2098,11 +2101,13 @@ void TG_(post_clo_init)(void) TG_(instrument_state) = TG_(clo).instrument_atstart; + /* Open CSV trace output file */ + TG_(trace_open_output)(); + if (VG_(clo_verbosity) > 0) { VG_(message)(Vg_UserMsg, - "For interactive control, run 'tracegrind_control%s%s -h'.\n", - (VG_(arg_vgdb_prefix) ? " " : ""), - (VG_(arg_vgdb_prefix) ? 
VG_(arg_vgdb_prefix) : "")); + "Streaming CSV trace output to tracegrind.out.%d\n", + VG_(getpid)()); } } diff --git a/tracegrind/threads.c b/tracegrind/threads.c index cb311e000..800d1d1f2 100644 --- a/tracegrind/threads.c +++ b/tracegrind/threads.c @@ -116,6 +116,9 @@ thread_info* new_thread(void) TG_(init_cost)( TG_(sets).full, t->lastdump_cost ); TG_(init_cost)( TG_(sets).full, t->sighandler_cost ); + /* CSV trace: per-thread sample snapshot (allocated lazily in trace_emit_sample) */ + t->last_sample_cost = 0; + /* init data containers */ TG_(init_fn_array)( &(t->fn_active) ); TG_(init_bbcc_hash)( &(t->bbccs) ); From 92113205a9d587bf1d1f573e9426be0fabc23f2e Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Thu, 5 Feb 2026 23:16:33 +0000 Subject: [PATCH 03/26] feat: add MsgPack+LZ4 output format for tracegrind Add --output-format=csv|msgpack option. MsgPack format uses LZ4 block compression achieving ~12x compression vs CSV. New files: - tg_msgpack.c/h: MsgPack encoder (write-only) - tg_lz4.c/h: LZ4 compression wrapper with VG_() adaptations - lz4.c/h: Vendored LZ4 library (BSD-2-Clause) - docs/tracegrind-msgpack-format.md: Format specification Co-Authored-By: Claude Opus 4.5 --- tracegrind/Makefile.am | 10 +- tracegrind/callstack.c | 8 +- tracegrind/clo.c | 8 + tracegrind/docs/tracegrind-msgpack-format.md | 122 + tracegrind/dump.c | 360 ++- tracegrind/global.h | 9 +- tracegrind/lz4.c | 2832 ++++++++++++++++++ tracegrind/lz4.h | 888 ++++++ tracegrind/tg_lz4.c | 95 + tracegrind/tg_lz4.h | 21 + tracegrind/tg_msgpack.c | 214 ++ tracegrind/tg_msgpack.h | 36 + 12 files changed, 4540 insertions(+), 63 deletions(-) create mode 100644 tracegrind/docs/tracegrind-msgpack-format.md create mode 100644 tracegrind/lz4.c create mode 100644 tracegrind/lz4.h create mode 100644 tracegrind/tg_lz4.c create mode 100644 tracegrind/tg_lz4.h create mode 100644 tracegrind/tg_msgpack.c create mode 100644 tracegrind/tg_msgpack.h diff --git a/tracegrind/Makefile.am b/tracegrind/Makefile.am index e23377779..562c4f6c6 100644 --- a/tracegrind/Makefile.am +++ b/tracegrind/Makefile.am @@ -11,7 +11,11 @@ pkginclude_HEADERS = tracegrind.h noinst_HEADERS = \ costs.h \ events.h \ - global.h + global.h \ + lz4.c \ + lz4.h \ + tg_lz4.h \ + tg_msgpack.h #---------------------------------------------------------------------------- # tracegrind- @@ -36,7 +40,9 @@ TRACEGRIND_SOURCES_COMMON = \ jumps.c \ main.c \ sim.c \ - threads.c + threads.c \ + tg_lz4.c \ + tg_msgpack.c # We sneakily include "cg_branchpred.c" and "cg_arch.c" from cachegrind TRACEGRIND_CFLAGS_COMMON = -I$(top_srcdir)/cachegrind diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index 80480eec4..e1e8d84bf 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -250,10 +250,10 @@ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) TG_(current_call_stack).sp++; - /* Emit CSV trace sample on function entry */ + /* Emit trace sample on function entry */ if (!skip && TG_(current_state).collect) { fn_node* to_fn = to->cxt->fn[0]; - TG_(trace_emit_sample)(TG_(current_tid), "ENTER", to_fn); + TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); } /* To allow for above assertion we set context of next frame to 0 */ @@ -359,9 +359,9 @@ void TG_(pop_call_stack)(void) } TG_(stat).ret_counter++; - /* Emit CSV trace sample on function exit */ + /* Emit trace sample on function exit */ if (TG_(current_state).collect) { - TG_(trace_emit_sample)(TG_(current_tid), "EXIT", to_fn); + TG_(trace_emit_sample)(TG_(current_tid), False, 
to_fn); } /* restore context */ diff --git a/tracegrind/clo.c b/tracegrind/clo.c index cd3a05012..50e4800e4 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -518,6 +518,11 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) else if VG_STR_CLO(arg, "--tracegrind-out-file", TG_(clo).out_format) {} + else if VG_XACT_CLO(arg, "--output-format=csv", + TG_(clo).output_format, output_format_csv) {} + else if VG_XACT_CLO(arg, "--output-format=msgpack", + TG_(clo).output_format, output_format_msgpack) {} + else if VG_BOOL_CLO(arg, "--mangle-names", TG_(clo).mangle_names) {} else if VG_BOOL_CLO(arg, "--skip-direct-rec", @@ -573,6 +578,7 @@ void TG_(print_usage)(void) VG_(printf)( "\n dump creation options:\n" " --tracegrind-out-file= Output file name [tracegrind.out.%%p]\n" +" --output-format=csv|msgpack Output format [csv]\n" " --dump-line=no|yes Dump source lines of costs? [yes]\n" " --dump-instr=no|yes Dump instruction address of costs? [no]\n" " --compress-strings=no|yes Compress strings in profile dump? [yes]\n" @@ -697,4 +703,6 @@ void TG_(set_clo_defaults)(void) TG_(clo).verbose = 0; TG_(clo).verbose_start = 0; #endif + + TG_(clo).output_format = output_format_csv; } diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md new file mode 100644 index 000000000..12b4c4658 --- /dev/null +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -0,0 +1,122 @@ +# Tracegrind MsgPack+LZ4 Output Format + +## Overview + +Tracegrind's `--output-format=msgpack` produces a binary trace file combining MsgPack serialization with LZ4 block compression. Files use the `.msgpack.lz4` extension. + +## File Structure + +``` +┌─────────────────────────────────┐ +│ File Header (8 bytes) │ +├─────────────────────────────────┤ +│ Schema Chunk │ +├─────────────────────────────────┤ +│ Data Chunk 1..N │ +├─────────────────────────────────┤ +│ End Marker (8 bytes) │ +└─────────────────────────────────┘ +``` + +## File Header + +| Offset | Size | Field | Description | +|--------|------|---------|-------------| +| 0 | 4 | magic | ASCII `TGMP` (0x54 0x47 0x4D 0x50) | +| 4 | 4 | version | Format version, uint32 LE (currently 1) | + +## Chunk Format + +Each chunk (schema and data) has the same header: + +| Offset | Size | Field | Description | +|--------|------|-------------------|-------------| +| 0 | 4 | uncompressed_size | Size after decompression, uint32 LE | +| 4 | 4 | compressed_size | Size of LZ4 block, uint32 LE | +| 8 | N | data | LZ4 block-compressed MsgPack data | + +## Schema Chunk + +The first chunk contains a MsgPack map: + +```json +{ + "version": 1, + "format": "tracegrind-msgpack", + "columns": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...] +} +``` + +### Fixed Columns + +| Index | Name | Type | Description | +|-------|-------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 0 = ENTER, 1 = EXIT | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | + +### Event Columns (index 7+) + +Event counters as delta values: `Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim`. Which columns are present depends on Tracegrind options. + +## Data Chunks + +Each data chunk contains concatenated MsgPack arrays (one per row): + +``` +[seq, tid, event, fn, obj, file, line, delta_Ir, ...] 
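+# Illustrative decoded row (names made up; assumes only the Ir event column):
+[1742, 1, 0, "compute", "/usr/lib/libfoo.so", "src/foo.c", 0, 4213]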
+``` + +The reference implementation writes 4096 rows per chunk. + +## End Marker + +8 zero bytes (uncompressed_size = 0, compressed_size = 0). + +## Example: Reading in Python + +```python +import struct, lz4.block, msgpack + +def read_tracegrind(filepath): + with open(filepath, 'rb') as f: + assert f.read(4) == b'TGMP' + version = struct.unpack('> 8) & 0xff); + hdr[2] = (UChar)((src_size >> 16) & 0xff); + hdr[3] = (UChar)((src_size >> 24) & 0xff); + hdr[4] = (UChar)(compressed_size & 0xff); + hdr[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr, 8); + + /* Write compressed data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + + /* Reset buffer for next chunk */ + msgpack_reset(&mp_state.buf); + mp_state.rows_in_chunk = 0; +} + +/* Write file header with schema metadata */ +static void msgpack_write_header(void) +{ + msgpack_buffer hdr; + msgpack_init(&hdr, 1024); + + /* Header is a map with metadata */ + msgpack_write_map_header(&hdr, 3); + + /* version */ + msgpack_write_key(&hdr, "version"); + msgpack_write_uint(&hdr, 1); + + /* format */ + msgpack_write_key(&hdr, "format"); + msgpack_write_str(&hdr, "tracegrind-msgpack", -1); + + /* columns */ + msgpack_write_key(&hdr, "columns"); + msgpack_write_array_header(&hdr, mp_state.ncols); + for (Int i = 0; i < mp_state.ncols; i++) { + msgpack_write_str(&hdr, mp_state.col_names[i], -1); + } + + /* Compress and write header chunk (with special marker) */ + SizeT src_size = hdr.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.hdr", dst_capacity); + + SizeT compressed_size = tg_lz4_compress( + compressed, dst_capacity, hdr.data, src_size); + + /* Magic + version (8 bytes): "TGMP" + version(4) */ + UChar magic[8] = {'T', 'G', 'M', 'P', 0x01, 0x00, 0x00, 0x00}; + VG_(write)(TG_(trace_out).fd, magic, 8); + + /* Header chunk size (4 bytes uncompressed, 4 bytes compressed) */ + UChar hdr_size[8]; + hdr_size[0] = (UChar)(src_size & 0xff); + hdr_size[1] = (UChar)((src_size >> 8) & 0xff); + hdr_size[2] = (UChar)((src_size >> 16) & 0xff); + hdr_size[3] = (UChar)((src_size >> 24) & 0xff); + hdr_size[4] = (UChar)(compressed_size & 0xff); + hdr_size[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr_size[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr_size[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr_size, 8); + + /* Compressed header data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + msgpack_free(&hdr); +} + +/* Initialize msgpack state with schema from event sets */ +static void msgpack_init_state(void) +{ + EventSet* es = TG_(sets).full; + Int g, i; + + /* Count dynamic event columns */ + Int n_events = 0; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) continue; + n_events += eg->size; + } + + mp_state.n_event_cols = n_events; + mp_state.ncols = 7 + n_events; /* 7 fixed + dynamic */ + + /* Allocate column names array */ + mp_state.col_names = VG_(malloc)("tg.mp.cols", + mp_state.ncols * sizeof(HChar*)); + + /* Fixed columns */ + mp_state.col_names[0] = "seq"; + mp_state.col_names[1] = "tid"; + mp_state.col_names[2] = "event"; + mp_state.col_names[3] = "fn"; + mp_state.col_names[4] = "obj"; + mp_state.col_names[5] = "file"; + 
mp_state.col_names[6] = "line"; + + /* Dynamic event columns */ + Int c = 7; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) continue; + for (i = 0; i < eg->size; i++) { + mp_state.col_names[c++] = eg->name[i]; + } + } + + /* Initialize buffer */ + msgpack_init(&mp_state.buf, MSGPACK_INITIAL_BUF); + mp_state.rows_in_chunk = 0; + + /* Write file header */ + msgpack_write_header(); +} + +/* Add a row to the msgpack output */ +static void msgpack_add_row(ULong seq, Int tid, Int event, + const HChar* fn_name, const HChar* obj_name, + const HChar* file_name, Int line, + const ULong* deltas, Int n_deltas) +{ + /* Each row is a msgpack array */ + msgpack_write_array_header(&mp_state.buf, mp_state.ncols); + + /* Fixed columns */ + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, event); /* 0=ENTER, 1=EXIT */ + msgpack_write_str(&mp_state.buf, fn_name, -1); + msgpack_write_str(&mp_state.buf, obj_name, -1); + msgpack_write_str(&mp_state.buf, file_name, -1); + msgpack_write_int(&mp_state.buf, line); + + /* Event delta columns */ + for (Int i = 0; i < n_deltas; i++) { + msgpack_write_uint(&mp_state.buf, deltas[i]); + } + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Close msgpack output */ +static void msgpack_close_output(void) +{ + /* Flush any remaining rows */ + msgpack_flush_chunk(); + + /* Write end marker (zero-size chunk) */ + UChar end[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + VG_(write)(TG_(trace_out).fd, end, 8); + + /* Cleanup */ + msgpack_free(&mp_state.buf); + if (mp_state.col_names) { + VG_(free)(mp_state.col_names); + mp_state.col_names = NULL; + } +} + /* Write a string to the trace output fd */ static void trace_write(const HChar* buf, Int len) { @@ -119,6 +335,15 @@ void TG_(trace_open_output)(void) filename[sizeof(filename) - 1] = '\0'; VG_(free)(expanded); + /* Append format-specific suffix */ + if (TG_(clo).output_format == output_format_msgpack) { + SizeT len = VG_(strlen)(filename); + if (len + 12 < sizeof(filename)) { + VG_(strncpy)(filename + len, ".msgpack.lz4", sizeof(filename) - len - 1); + filename[sizeof(filename) - 1] = '\0'; + } + } + res = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY|VKI_O_TRUNC, VKI_S_IRUSR|VKI_S_IWUSR); @@ -133,6 +358,11 @@ void TG_(trace_open_output)(void) TG_(trace_out).initialized = True; TG_(trace_out).header_written = False; + /* Initialize format-specific writer */ + if (TG_(clo).output_format == output_format_msgpack) { + msgpack_init_state(); + } + if (VG_(clo_verbosity) > 1) VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); } @@ -166,21 +396,14 @@ static void trace_write_header(void) trace_write(buf, pos); } -void TG_(trace_emit_sample)(ThreadId tid, const HChar* event_type, +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn) { - HChar buf[4096]; - HChar escaped[1024]; - Int pos = 0; Int i; if (!TG_(trace_out).initialized) return; if (TG_(trace_out).fd < 0) return; - /* Lazily write header on first sample */ - if (!TG_(trace_out).header_written) - trace_write_header(); - /* Get current thread info for per-thread last_sample_cost */ thread_info* ti = TG_(get_current_thread)(); if (!ti) return; @@ -201,52 +424,70 @@ void TG_(trace_emit_sample)(ThreadId tid, const HChar* event_type, const HChar* obj_name = (fn && fn->file && fn->file->obj) ? 
fn->file->obj->name : "???"; const HChar* file_name = (fn && fn->file) ? fn->file->name : "???"; - UInt line = (fn && fn->file) ? 0 : 0; /* line from fn_node's BB */ + UInt line = 0; - /* Try to get line number from the function's BB debug info */ - if (fn && fn->pure_cxt) { - /* We could look up debug info here, but fn_node doesn't store line. - * The BB that was the entry point does store it. We use 0 as default. */ + /* Compute deltas for all event counters */ + ULong deltas[64]; /* es->size is always small */ + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } } - /* seq, tid, event */ - pos += VG_(sprintf)(buf + pos, "%llu,%u,%s,", - TG_(trace_out).seq, - (UInt)tid, - event_type); + /* Event type: 0=ENTER, 1=EXIT */ + Int event_val = is_enter ? 0 : 1; + const HChar* event_str = is_enter ? "ENTER" : "EXIT"; - /* fn (escaped) */ - csv_escape(escaped, sizeof(escaped), fn_name); - pos += VG_(sprintf)(buf + pos, "%s,", escaped); + if (TG_(clo).output_format == output_format_msgpack) { + /* --- MsgPack + LZ4 path --- */ + msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, + fn_name, obj_name, file_name, (Int)line, + deltas, es->size); + } else { + /* --- CSV path --- */ + HChar buf[4096]; + HChar escaped[1024]; + Int pos = 0; - /* obj (escaped) */ - csv_escape(escaped, sizeof(escaped), obj_name); - pos += VG_(sprintf)(buf + pos, "%s,", escaped); + /* Lazily write header on first sample */ + if (!TG_(trace_out).header_written) + trace_write_header(); - /* file (escaped) */ - csv_escape(escaped, sizeof(escaped), file_name); - pos += VG_(sprintf)(buf + pos, "%s,", escaped); + /* seq, tid, event */ + pos += VG_(sprintf)(buf + pos, "%llu,%u,%s,", + TG_(trace_out).seq, + (UInt)tid, + event_str); - /* line */ - pos += VG_(sprintf)(buf + pos, "%u", line); + /* fn (escaped) */ + csv_escape(escaped, sizeof(escaped), fn_name); + pos += VG_(sprintf)(buf + pos, "%s,", escaped); - /* Compute and emit deltas for all event groups */ - if (current_cost && ti->last_sample_cost) { - for (i = 0; i < es->size; i++) { - ULong delta = current_cost[i] - ti->last_sample_cost[i]; - pos += VG_(sprintf)(buf + pos, ",%llu", delta); - } - /* Update last_sample_cost snapshot */ - TG_(copy_cost)(es, ti->last_sample_cost, current_cost); - } else { - /* No cost data available, emit zeros */ + /* obj (escaped) */ + csv_escape(escaped, sizeof(escaped), obj_name); + pos += VG_(sprintf)(buf + pos, "%s,", escaped); + + /* file (escaped) */ + csv_escape(escaped, sizeof(escaped), file_name); + pos += VG_(sprintf)(buf + pos, "%s,", escaped); + + /* line */ + pos += VG_(sprintf)(buf + pos, "%u", line); + + /* event deltas */ for (i = 0; i < es->size; i++) { - pos += VG_(sprintf)(buf + pos, ",0"); + pos += VG_(sprintf)(buf + pos, ",%llu", deltas[i]); } - } - pos += VG_(sprintf)(buf + pos, "\n"); - trace_write(buf, pos); + pos += VG_(sprintf)(buf + pos, "\n"); + trace_write(buf, pos); + } } void TG_(trace_close_output)(void) @@ -254,22 +495,29 @@ void TG_(trace_close_output)(void) if (!TG_(trace_out).initialized) return; if (TG_(trace_out).fd < 0) return; - /* Write a totals summary comment at the end for verification */ - if (TG_(total_cost)) { - HChar buf[4096]; - Int pos = 0; - Int i; - EventSet* es = TG_(sets).full; - - pos += VG_(sprintf)(buf + pos, "# totals:"); - for (i = 0; i < 
es->size; i++) { - pos += VG_(sprintf)(buf + pos, " %llu", TG_(total_cost)[i]); + if (TG_(clo).output_format == output_format_msgpack) { + /* MsgPack close flushes remaining rows, writes end marker, closes fd */ + msgpack_close_output(); + VG_(close)(TG_(trace_out).fd); + } else { + /* Write a totals summary comment at the end for verification */ + if (TG_(total_cost)) { + HChar buf[4096]; + Int pos = 0; + Int i; + EventSet* es = TG_(sets).full; + + pos += VG_(sprintf)(buf + pos, "# totals:"); + for (i = 0; i < es->size; i++) { + pos += VG_(sprintf)(buf + pos, " %llu", TG_(total_cost)[i]); + } + pos += VG_(sprintf)(buf + pos, "\n"); + trace_write(buf, pos); } - pos += VG_(sprintf)(buf + pos, "\n"); - trace_write(buf, pos); + + VG_(close)(TG_(trace_out).fd); } - VG_(close)(TG_(trace_out).fd); TG_(trace_out).fd = -1; TG_(trace_out).initialized = False; diff --git a/tracegrind/global.h b/tracegrind/global.h index e2fbfcd9f..dd9afd561 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -78,6 +78,11 @@ typedef enum { systime_nsec } Collect_Systime; +typedef enum { + output_format_csv, + output_format_msgpack +} OutputFormat; + typedef struct _CommandLineOptions CommandLineOptions; struct _CommandLineOptions { @@ -126,6 +131,8 @@ struct _CommandLineOptions { Int verbose; ULong verbose_start; #endif + + OutputFormat output_format; /* csv or msgpack */ }; /*------------------------------------------------------------*/ @@ -800,7 +807,7 @@ void TG_(init_dumps)(void); /* CSV trace output (from dump.c) */ void TG_(trace_open_output)(void); -void TG_(trace_emit_sample)(ThreadId tid, const HChar* event_type, fn_node* fn); +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); void TG_(trace_close_output)(void); /*------------------------------------------------------------*/ diff --git a/tracegrind/lz4.c b/tracegrind/lz4.c new file mode 100644 index 000000000..a1f02e75d --- /dev/null +++ b/tracegrind/lz4.c @@ -0,0 +1,2832 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (c) Yann Collet. All rights reserved. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how stateless compression functions like `LZ4_compress_default()` + * allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \ + || (defined(__riscv) && defined(__riscv_zicclsm)) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || defined(_MSC_VER) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +# define LZ4_STATIC_LINKING_ONLY +#endif +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +# include /* only present in VS2005+ */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 6237) /* disable: C6237: conditional expression is always 0 */ +# pragma warning(disable : 6239) /* disable: C6239: ( && ) always evaluates to the result of */ +# pragma warning(disable : 6240) /* disable: C6240: ( && ) always evaluates to the result of */ +# pragma warning(disable : 6326) /* disable: C6326: Potential comparison of a constant with another constant */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# if defined (_MSC_VER) && !defined (__clang__) /* MSVC */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# if defined (__GNUC__) || defined (__clang__) +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +# undef LZ4_FORCE_INLINE +# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) +#else +# define LZ4_FORCE_O2 +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +/* Should the alignment test prove unreliable, for some reason, + * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +# define LZ4_ALIGN_TEST 1 +#endif + + +/*-************************************ +* Memory routines +**************************************/ + +/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : + * Disable relatively high-level LZ4/HC functions that use dynamic memory + * allocation functions (malloc(), calloc(), free()). + * + * Note that this is a compile-time switch. And since it disables + * public/stable LZ4 v1 API functions, we don't recommend using this + * symbol to generate a library for distribution. + * + * The following public functions are removed when this symbol is defined. + * - lz4 : LZ4_createStream, LZ4_freeStream, + * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create (deprecated) + * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, + * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) + * - lz4frame, lz4file : All LZ4F_* functions + */ +#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +# define ALLOC(s) lz4_error_memory_allocation_is_disabled +# define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +# define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#elif defined(LZ4_USER_MEMORY_FUNCTIONS) +/* memory management functions can be customized by user project. + * Below functions must exist somewhere in the Project + * and be available at link time */ +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +# define ALLOC(s) LZ4_malloc(s) +# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s) +# define FREEMEM(p) LZ4_free(p) +#else +# include /* malloc, calloc, free */ +# define ALLOC(s) malloc(s) +# define ALLOC_AND_ZERO(s) calloc(1,s) +# define FREEMEM(p) free(p) +#endif + +#if ! 
LZ4_FREESTANDING +# include /* memset, memcpy */ +#endif +#if !defined(LZ4_memset) +# define LZ4_memset(p,v,s) memset((p),(v),(s)) +#endif +#define MEM_INIT(p,v,s) LZ4_memset((p),(v),(s)) + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U<=1) +# include +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) +# include + static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ " %i: ", __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment -1)) == 0; +} + + +/*-************************************ +* Types +**************************************/ +#include +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef unsigned char BYTE; /*uint8_t not necessarily blessed to alias arbitrary type*/ + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. 
+ */ +#if !defined(LZ4_memcpy) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +# else +# define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +# endif +#endif + +#if !defined(LZ4_memmove) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4_memmove __builtin_memmove +# else +# define LZ4_memmove memmove +# endif +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define LZ4_PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__)) +#elif defined(_MSC_VER) +#define LZ4_PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop)) +#endif + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +LZ4_PACK(typedef struct { U16 u16; }) LZ4_unalign16; +LZ4_PACK(typedef struct { U32 u32; }) LZ4_unalign32; +LZ4_PACK(typedef struct { reg_t uArch; }) LZ4_unalignST; + +static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign16*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const LZ4_unalign32*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalignST*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign16*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign32*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] | (p[1]<<8)); + } +} + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT +static U32 LZ4_readLE32(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read32(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U32)p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24); + } +} +#endif + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = 
(BYTE)(value>>8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d= dstPtr + MINMATCH + * - there is at least 12 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch(offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ +# pragma warning(push) +# pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +#endif + LZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ +# pragma warning(pop) +#endif + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +# if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(LZ4_FORCE_SW_BITCOUNT) +/*-************************************************************************************************* +* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11. +* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics +* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC. +****************************************************************************************************/ +# if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. 
*/ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +# else + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +# endif +# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +# else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; +# else + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +# else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. + Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. + Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +#endif +# endif + } else /* 32 bits */ { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +# else + val >>= 8; + val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | + (val + 0x00FF0000)) >> 24; + return (unsigned)val ^ 3; +# endif + } + } +} + + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn+=STEPSIZE; pMatch+=STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } } + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Everything concerning the preceding content is + * in a separate context, pointed to by ctx->dictCtx. + * ctx->dictionary, ctx->dictSize, and table entries + * in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. 
+ */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } + + +/*-**************************************** +* Internal Definitions, used only in Tests +*******************************************/ +#if defined (__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize); +int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize); +#if defined (__cplusplus) +} +#endif + +/*-****************************** +* Compression functions +********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT + return LZ4_hash4(LZ4_readLE32(p), tableType); +#else + return LZ4_hash4(LZ4_read32(p), tableType); +#endif +} + +LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! 
*/ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +/* LZ4_putPosition*() : only used in byPtr mode */ +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType) +{ + const BYTE** const hashTable = (const BYTE**)tableBase; + assert(tableType == byPtr); (void)tableType; + hashTable[h] = p; +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + assert(tableType == byPtr); (void)tableType; + { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType); +} + +LZ4_FORCE_INLINE void +LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType + || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) + || ((tableType == byU32) && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", (void*)cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. + */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic_validated() : + * inlined, to ensure branches are decided at compilation time. 
+ * The following conditions are presumed already validated: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*)source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*)source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = + (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with indexes in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictionary == NULL) ? NULL : + (dictDirective == usingDictCtx) ? + dictionary + dictSize - dictCtx->currentOffset : + dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType); + assert(ip != NULL); + if (tableType == byU16) assert(inputSize= 1); + + lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSizehashTable, byPtr); + } else { + LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); + } } + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); + + } while ( (match+LZ4_DISTANCE_MAX < ip) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */ + assert(matchIndex < current); + if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) + && (matchIndex+LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) offset = current - matchIndex; + break; /* match found */ + } + + } while(1); + } + + /* Catch up */ + filledIp = ip; + assert(ip > anchor); /* this is always true as ip has been advanced before entering the main loop */ + if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { + do { ip--; match--; } while (((ip > anchor) & (match > 
lowLimit)) && (unlikely(ip[-1] == match[-1]))); + } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) { + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token = (RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< olimit)) { + /* the match was too close to the end, rewind and go to last literals */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); op+=2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); + assert(ip-match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); op+=2; + } + + /* Encode MatchLength */ + { unsigned matchCode; + + if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) + && (lowLimit==dictionary) /* match within extDict */ ) { + const BYTE* limit = ip + (dictEnd-match); + assert(dictEnd > match); + if (limit > matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip==limit) { + unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less than filledIp + * we have positions in the hash table beyond the current position. This is + * a problem if we reuse the hash table. So we have to remove these positions + * from the hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + { U32 const h = LZ4_hashPosition(ip-2, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip-2, h, cctx->hashTable, byPtr); + } else { + U32 const idx = (U32)((ip-2) - base); + LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); + } } + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType); + LZ4_putPosition(ip, cctx->hashTable, tableType); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) + && (LZ4_read32(match) == LZ4_read32(ip)) ) { + token=op++; + *token=0; + if (maybe_extMem) offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRun = (size_t)(iend - anchor); + if ( (outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit-op) - 1/*token*/; + lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun< 0); + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result); + return result; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time; + * takes care of src == (NULL, 0) + * and forward the rest to LZ4_compress_generic_validated */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const src, + char* const dst, + const int srcSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int dstCapacity, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", + srcSize, dstCapacity); + + if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */ + if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ + if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */ + DEBUGLOG(5, "Generating an empty block"); + assert(outputDirective == notLimited || dstCapacity >= 1); + assert(dst != NULL); + dst[0] = 0; + if (outputDirective == fillOutput) { + assert (inputConsumed != NULL); + *inputConsumed = 0; + } + return 1; + } + assert(src != NULL); + + return LZ4_compress_generic_validated(cctx, src, dst, srcSize, + inputConsumed, /* only written into if outputDirective == fillOutput */ + dstCapacity, outputDirective, + tableType, dictDirective, dictIssue, acceleration); +} + + +int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. 
It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* const ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + assert(ctx != NULL); + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. 
*/ +static int LZ4_compress_destSize_extState_internal(LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, acceleration); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, acceleration); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, acceleration); + } } +} + +int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration) +{ + int const r = LZ4_compress_destSize_extState_internal((LZ4_stream_t*)state, src, dst, srcSizePtr, targetDstSize, acceleration); + /* clean the state on exit */ + LZ4_initStream(state, sizeof (LZ4_stream_t)); + return r; +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* const ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState_internal(ctx, src, dst, srcSizePtr, targetDstSize, 1); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); + DEBUGLOG(4, "LZ4_createStream %p", (void*)lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} +#endif + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { return NULL; } + if (size < sizeof(LZ4_stream_t)) { return NULL; } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", (void*)LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", (void*)LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} +#endif + + +typedef enum { 
_ld_fast, _ld_slow } LoadDict_mode_e; +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict_internal(LZ4_stream_t* LZ4_dict, + const char* dictionary, int dictSize, + LoadDict_mode_e _ld) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + U32 idx32; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, (void*)dictionary, (void*)LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + idx32 = dict->currentOffset - dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + /* Note: overwriting => favors positions end of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + p+=3; idx32+=3; + } + + if (_ld == _ld_slow) { + /* Fill hash table with additional references, to improve compression capability */ + p = dict->dictionary; + idx32 = dict->currentOffset - dict->dictSize; + while (p <= dictEnd-HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + U32 const limit = dict->currentOffset - 64 KB; + if (LZ4_getIndexOnHash(h, dict->hashTable, tableType) <= limit) { + /* Note: not overwriting => favors positions beginning of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + } + p++; idx32++; + } + } + + return (int)dict->dictSize; +} + +int LZ4_loadDict(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_fast); +} + +int LZ4_loadDictSlow(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_slow); +} + +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) +{ + const LZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? NULL : + &(dictionaryStream->internal_donotuse); + + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", + (void*)workingStream, (void*)dictionaryStream, + dictCtx != NULL ? dictCtx->dictSize : 0); + + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } + + /* Don't actually attach an empty dictionary. 
+ */
+        if (dictCtx->dictSize == 0) {
+            dictCtx = NULL;
+        }
+    }
+    workingStream->internal_donotuse.dictCtx = dictCtx;
+}
+
+
+static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize)
+{
+    assert(nextSize >= 0);
+    if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) {   /* potential ptrdiff_t overflow (32-bits mode) */
+        /* rescale hash table */
+        U32 const delta = LZ4_dict->currentOffset - 64 KB;
+        const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+        int i;
+        DEBUGLOG(4, "LZ4_renormDictT");
+        for (i=0; i<LZ4_HASH_SIZE_U32; i++) {
+            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
+            else LZ4_dict->hashTable[i] -= delta;
+        }
+        LZ4_dict->currentOffset = 64 KB;
+        if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
+        LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+    }
+}
+
+
+int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream,
+                                const char* source, char* dest,
+                                int inputSize, int maxOutputSize,
+                                int acceleration)
+{
+    const tableType_t tableType = byU32;
+    LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse;
+    const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL;
+
+    DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize);
+
+    LZ4_renormDictT(streamPtr, inputSize);   /* fix index overflow */
+    if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT;
+    if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX;
+
+    /* invalidate tiny dictionaries */
+    if ( (streamPtr->dictSize < 4)       /* tiny dictionary : not enough for a hash */
+      && (dictEnd != source)             /* prefix mode */
+      && (inputSize > 0)                 /* tolerance : don't lose history, in case next invocation would use prefix mode */
+      && (streamPtr->dictCtx == NULL)    /* usingDictCtx */
+      ) {
+        DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, (void*)streamPtr->dictionary);
+        /* remove dictionary existence from history, to employ faster prefix mode */
+        streamPtr->dictSize = 0;
+        streamPtr->dictionary = (const BYTE*)source;
+        dictEnd = source;
+    }
+
+    /* Check overlapping input/dictionary space */
+    {   const char* const sourceEnd = source + inputSize;
+        if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) {
+            streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+            if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+            if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+            streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize;
+        }
+    }
+
+    /* prefix mode : source data follows dictionary */
+    if (dictEnd == source) {
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration);
+        else
+            return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration);
+    }
+
+    /* external dictionary mode */
+    {   int result;
+        if (streamPtr->dictCtx) {
+            /* We depend here on the fact that dictCtx'es (produced by
+             * LZ4_loadDict) guarantee that their tables contain no references
+             * to offsets between dictCtx->currentOffset - 64 KB and
+             * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe
+             * to use noDictIssue even when the dict isn't a full 64 KB.
+ */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. + */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { /* small data <= 4 KB */ + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : no need to call LZ4_loadDict() afterwards, dictionary is immediately usable, + * one can therefore call LZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + + DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, (void*)safeBuffer); + + if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } + + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? 
(a) : (b) ) + + +/* variant for decompress_unsafe() + * does not know end of input + * presumes input is well formed + * note : will consume at least one byte */ +static size_t read_long_length_no_check(const BYTE** pp) +{ + size_t b, l = 0; + do { b = **pp; (*pp)++; l += b; } while (b==255); + DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1) + return l; +} + +/* core decoder variant for LZ4_decompress_fast*() + * for legacy support only : these entry points are deprecated. + * - Presumes input is correctly formed (no defense vs malformed inputs) + * - Does not know input size (presume input buffer is "large enough") + * - Decompress a full block (only) + * @return : nb of bytes read from input. + * Note : this variant is not optimized for speed, just for maintenance. + * the goal is to remove support of decompress_fast*() variants by v2.0 +**/ +LZ4_FORCE_INLINE int +LZ4_decompress_unsafe_generic( + const BYTE* const istart, + BYTE* const ostart, + int decompressedSize, + + size_t prefixSize, + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note: =0 if dictStart==NULL */ + ) +{ + const BYTE* ip = istart; + BYTE* op = (BYTE*)ostart; + BYTE* const oend = ostart + decompressedSize; + const BYTE* const prefixStart = ostart - prefixSize; + + DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); + if (dictStart == NULL) assert(dictSize == 0); + + while (1) { + /* start new sequence */ + unsigned token = *ip++; + + /* literals */ + { size_t ll = token >> ML_BITS; + if (ll==15) { + /* long literal length */ + ll += read_long_length_no_check(&ip); + } + if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */ + LZ4_memmove(op, ip, ll); /* support in-place decompression */ + op += ll; + ip += ll; + if ((size_t)(oend-op) < MFLIMIT) { + if (op==oend) break; /* end of block */ + DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op); + /* incorrect end of block : + * last match must start at least MFLIMIT==12 bytes before end of output block */ + return -1; + } } + + /* match */ + { size_t ml = token & 15; + size_t const offset = LZ4_readLE16(ip); + ip+=2; + + if (ml==15) { + /* long literal length */ + ml += read_long_length_no_check(&ip); + } + ml += MINMATCH; + + if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */ + + { const BYTE* match = op - offset; + + /* out of range */ + if (offset > (size_t)(op - prefixStart) + dictSize) { + DEBUGLOG(6, "offset out of range"); + return -1; + } + + /* check special case : extDict */ + if (offset > (size_t)(op - prefixStart)) { + /* extDict scenario */ + const BYTE* const dictEnd = dictStart + dictSize; + const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart)); + size_t const extml = (size_t)(dictEnd - extMatch); + if (extml > ml) { + /* match entirely within extDict */ + LZ4_memmove(op, extMatch, ml); + op += ml; + ml = 0; + } else { + /* match split between extDict & prefix */ + LZ4_memmove(op, extMatch, extml); + op += extml; + ml -= extml; + } + match = prefixStart; + } + + /* match copy - slow variant, supporting overlap copy */ + { size_t u; + for (u=0; u= ipmax before start of loop. Returns initial_error if so. + * @error (output) - error code. Must be set to 0 before call. 
+**/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +LZ4_FORCE_INLINE Rvl_t +read_variable_length(const BYTE** ip, const BYTE* ilimit, + int initial_check) +{ + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; + } + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { + return rvl_error; + } + if (likely(s != 255)) return length; + do { + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { + return rvl_error; + } + } while (s == 255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if ((src == NULL) || (outputSize < 0)) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int checkOffset = (dictSize < (int)(64 KB)); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if (unlikely(outputSize==0)) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if (unlikely(srcSize==0)) { return -1; } + + /* LZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable branch. + * note : fast loop may show a regression for some client arm chips. 
*/ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "move to safe decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using fast decode loop"); + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long literal length"); + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + + /* copy literals */ + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((op+length>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, op+length); + ip += length; op += length; + } else if (ip <= iend-(16 + 1/*max lit + offset + nextToken*/)) { + /* We don't need to check oend, since we check it once for each loop below */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* Literals can only be <= 14, but hope compilers optimize better when copy by a register size */ + LZ4_memcpy(op, ip, 16); + ip += length; op += length; + } else { + goto safe_literal_copy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + DEBUGLOG(6, "blockPos%6u: offset = %u", (unsigned)(op-(BYTE*)dst), (unsigned)offset); + match = op - offset; + assert(match <= op); /* overflow check */ + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, " match length token = %u (len==%u)", (unsigned)length, (unsigned)length+MINMATCH); + + if (length == ML_MASK) { + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + DEBUGLOG(5, "error reading long match length"); + goto _output_error; + } + length += addl; + length += MINMATCH; + DEBUGLOG(7, " long match length == %u", (unsigned)length); + if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(7, "moving to safe_match_copy (ml==%u)", (unsigned)length); + goto safe_match_copy; + } + + /* Fastpath check: skip LZ4_wildCopy32 when true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + if ( checkOffset && (unlikely(match + dictSize < lowPrefix)) ) { + DEBUGLOG(5, "Error : pos=%zi, offset=%zi => outside buffers", op-lowPrefix, op-match); + goto _output_error; + } + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, 
(size_t)(oend-op)); + } else { + DEBUGLOG(6, "end-of-block condition violated") + goto _output_error; + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using safe decode loop"); + while (1) { + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (length != RUN_MASK) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((ip < shortiend) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, 16); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + DEBUGLOG(7, "blockPos%6u: matchLength token = %u (len=%u)", (unsigned)(op-(BYTE*)dst), (unsigned)length, (unsigned)length + 4); + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + /* copy literals */ + cpy = op+length; + + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). + * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((ip+length != iend) || (cpy > oend)) { + DEBUGLOG(5, "should have been last run of literals") + DEBUGLOG(5, "ip(%p) + length(%i) = %p != iend (%p)", (void*)ip, (int)length, (void*)(ip+length), (void*)iend); + DEBUGLOG(5, "or cpy(%p) > (oend-MFLIMIT)(%p)", (void*)cpy, (void*)(oend-MFLIMIT)); + DEBUGLOG(5, "after writing %u bytes / %i bytes available", (unsigned)(op-(BYTE*)dst), outputSize); + goto _output_error; + } + } + LZ4_memmove(op, ip, length); /* supports overlapping memory regions, for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* can overwrite up to 8 bytes beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, "blockPos%6u: matchLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); + + _copy_match: + if (length == ML_MASK) { + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + + /* Overflow error 
detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + DEBUGLOG(5, "LZ4_decompress_fast"); + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. 
*/ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, + size_t prefixSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= sizeof(LZ4_streamDecode_t_internal)); + return (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} +#endif + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). 
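+ * A minimal usage sketch (buffer names below are placeholders, error checks elided) :
+ *     LZ4_streamDecode_t sd;
+ *     LZ4_setStreamDecode(&sd, dictBuf, dictLen);    // reference previously decoded data
+ *     int const n = LZ4_decompress_safe_continue(&sd, block, out, blockSize, outCapacity);
+ *     // n > 0 : nb of decoded bytes ; the decoded data must then remain in place
+ *     // (or be re-declared with LZ4_setStreamDecode()) before the next dependent block.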
+ * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*) dictionary; + } + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. 
*/ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 int +LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* const lz4sd = + (assert(LZ4_streamDecode!=NULL), &LZ4_streamDecode->internal_donotuse); + int result; + + DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + DEBUGLOG(5, "continue using existing prefix"); + result = LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + DEBUGLOG(5, "prefix becomes extDict"); + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize); +} + 
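+/* Illustrative sketch of the one-shot dictionary round-trip served by the _usingDict()
+ * decoders above together with LZ4_compress_fast_continue(). Names such as dictBuf,
+ * src, cmp and out are placeholders, and error handling is elided :
+ *
+ *     LZ4_stream_t cctx;
+ *     LZ4_initStream(&cctx, sizeof(cctx));
+ *     LZ4_loadDict(&cctx, dictBuf, dictLen);     // dictBuf must stay valid during compression
+ *     int const csize = LZ4_compress_fast_continue(&cctx, src, cmp, srcSize, cmpCapacity, 1);
+ *
+ *     int const dsize = LZ4_decompress_safe_usingDict(cmp, out, csize, outCapacity,
+ *                                                     dictBuf, dictLen);
+ *     // dsize should equal srcSize ; a negative value indicates malformed input.
+ */
+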
+int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + (size_t)dictSize, NULL, 0); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} +#endif + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/tracegrind/lz4.h b/tracegrind/lz4.h new file mode 100644 index 000000000..7f2a89d40 --- /dev/null +++ b/tracegrind/lz4.h @@ -0,0 +1,888 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (c) Yann Collet. All rights reserved. 
+
+  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+      * Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the following disclaimer
+  in the documentation and/or other materials provided with the
+  distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  You can contact the author at :
+    - LZ4 homepage : http://www.lz4.org
+    - LZ4 source repository : https://github.com/lz4/lz4
+*/
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef LZ4_H_2983827168210
+#define LZ4_H_2983827168210
+
+/* --- Dependency --- */
+#if !LZ4_FREESTANDING
+#include <stddef.h>   /* size_t */
+#endif
+
+
+/**
+  Introduction
+
+  LZ4 is a lossless compression algorithm, providing compression speed >500 MB/s per core,
+  scalable with multi-cores CPU. It features an extremely fast decoder, with speed in
+  multiple GB/s per core, typically reaching RAM speed limits on multi-core systems.
+
+  The LZ4 compression library provides in-memory compression and decompression functions.
+  It gives full buffer control to the user.
+  Compression can be done in:
+    - a single step (described as Simple Functions)
+    - a single step, reusing a context (described in Advanced Functions)
+    - unbounded multiple steps (described as Streaming compression)
+
+  lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md).
+  Decompressing such a compressed block requires additional metadata.
+  Exact metadata depends on exact decompression function.
+  For the typical case of LZ4_decompress_safe(),
+  metadata includes block's compressed size, and maximum bound of decompressed size.
+  Each application is free to encode and pass such metadata in whichever way it wants.
+
+  lz4.h only handles blocks; it cannot generate Frames.
+
+  Blocks are different from Frames (doc/lz4_Frame_format.md).
+  Frames bundle both blocks and metadata in a specified manner.
+  Embedding metadata is required for compressed data to be self-contained and portable.
+  Frame format is delivered through a companion API, declared in lz4frame.h.
+  The `lz4` CLI can only manage frames.
+*/
+
+/*^***************************************************************
+*  Export parameters
+*****************************************************************/
+/*
+*  LZ4_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*  LZ4LIB_VISIBILITY :
+*  Control library symbols visibility.
+*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*! LZ4_FREESTANDING : + * When this macro is set to 1, it enables "freestanding mode" that is + * suitable for typical freestanding environment which doesn't support + * standard C library. + * + * - LZ4_FREESTANDING is a compile-time switch. + * - It requires the following macros to be defined: + * LZ4_memcpy, LZ4_memmove, LZ4_memset. + * - It only enables LZ4/HC functions which don't use heap. + * All LZ4F_* functions are not supported. + * - See tests/freestanding.c to check its basic setup. + */ +#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) +# define LZ4_HEAPMODE 0 +# define LZ4HC_HEAPMODE 0 +# define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +# if !defined(LZ4_memcpy) +# error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." +# endif +# if !defined(LZ4_memset) +# error "LZ4_FREESTANDING requires macro 'LZ4_memset'." +# endif +# if !defined(LZ4_memmove) +# error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." +# endif +#elif ! defined(LZ4_FREESTANDING) +# define LZ4_FREESTANDING 0 +#endif + + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 10 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version; requires v1.3.0+ */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version; requires v1.7.5+ */ + + +/*-************************************ +* Tuning memory usage +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Can be selected at compile time, by setting LZ4_MEMORY_USAGE. + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB) + * Increasing memory usage improves compression ratio, generally at the cost of speed. + * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality. + * Default value is 14, for 16KB, which nicely fits into most L1 caches. + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT +#endif + +/* These are absolute limits, they should not be changed by users */ +#define LZ4_MEMORY_USAGE_MIN 10 +#define LZ4_MEMORY_USAGE_DEFAULT 14 +#define LZ4_MEMORY_USAGE_MAX 20 + +#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) +# error "LZ4_MEMORY_USAGE is too small !" +#endif + +#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) +# error "LZ4_MEMORY_USAGE is too large !" 
+#endif + +/*-************************************ +* Simple Functions +**************************************/ +/*! LZ4_compress_default() : + * Compresses 'srcSize' bytes from buffer 'src' + * into already allocated 'dst' buffer of size 'dstCapacity'. + * Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'src' into a more limited 'dst' budget, + * compression stops *immediately*, and the function result is zero. + * In which case, 'dst' content is undefined (invalid). + * srcSize : max supported value is LZ4_MAX_INPUT_SIZE. + * dstCapacity : size of buffer 'dst' (which must be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails + * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). + */ +LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + * @compressedSize : is the exact complete size of the compressed block. + * @dstCapacity : is the size of destination buffer (which must be already allocated), + * presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. + * Note 1 : This function is protected against malicious data packets : + * it will never writes outside 'dst' buffer, nor read outside 'source' buffer, + * even if the compressed block is maliciously modified to order the decoder to do these actions. + * In such case, the decoder stops immediately, and considers the compressed block malformed. + * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them. + * The implementation is free to send / store / derive this information in whichever way is most beneficial. + * If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead. + */ +LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is incorrect (too large or negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! 
LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). + Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c). +*/ +LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. + * Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'dstCapacity'. + * This function either compresses the entire 'src' content into 'dst' if it's large enough, + * or fill 'dst' buffer completely with as much data as possible from 'src'. + * note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : in+out parameter. Initially contains size of input. + * Will be modified to indicate how many bytes where read from 'src' to fill 'dst'. + * New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails. + * + * Note : 'targetDstSize' must be >= 1, because it's the smallest valid lz4 payload. + * + * Note 2:from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): + * the produced compressed content could, in rare circumstances, + * require to be decompressed into a destination buffer + * larger by at least 1 byte than decompressesSize. + * If an application uses `LZ4_compress_destSize()`, + * it's highly recommended to update liblz4 to v1.9.2 or better. + * If this can't be done or ensured, + * the receiving decompression function should provide + * a dstCapacity which is > decompressedSize, by at least 1 byte. + * See https://github.com/lz4/lz4/issues/859 for details + */ +LZ4LIB_API int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize); + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective. + * This can be useful to boost performance + * whenever only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize) + * If source stream is detected malformed, function returns a negative result. + * + * Note 1 : @return can be < targetOutputSize, if compressed block contains less data. + * + * Note 2 : targetOutputSize must be <= dstCapacity + * + * Note 3 : this function effectively stops decoding on reaching targetOutputSize, + * so dstCapacity is kind of redundant. 
+ * This is because in older versions of this function, + * decoding operation would still write complete sequences. + * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * Thankfully, this is no longer necessary. + * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/*! + Note about RC_INVOKED + + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio). + https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros + + - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) + and reports warning "RC4011: identifier truncated". + + - To eliminate the warning, we surround long preprocessor symbol with + "#if !defined(RC_INVOKED) ... #endif" block that means + "skip this block when rc.exe is trying to read it". +*/ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. 
+ * The same dictionary will have to be loaded on decompression side for successful decoding. + * Dictionary are useful for better compression of small data (KB range). + * While LZ4 itself accepts any input as dictionary, dictionary efficiency is also a topic. + * When in doubt, employ the Zstandard's Dictionary Builder. + * Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_loadDictSlow() : v1.10.0+ + * Same as LZ4_loadDict(), + * but uses a bit more cpu to reference the dictionary content more thoroughly. + * This is expected to slightly improve compression ratio. + * The extra-cpu cost is likely worth it if the dictionary is re-used across multiple sessions. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded) + */ +LZ4LIB_API int LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_attach_dictionary() : stable since v1.10.0 + * + * This allows efficient re-use of a static dictionary multiple times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references @dictionaryStream in-place. + * + * Several assumptions are made about the state of @dictionaryStream. + * Currently, only states which have been prepared by LZ4_loadDict() or + * LZ4_loadDictSlow() should be expected to work. + * + * Alternatively, the provided @dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. + * @dictionaryStream stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the compression session. + * + * Note: there is no equivalent LZ4_attach_*() method on the decompression side + * because there is no initialization cost, hence no need to share the cost across multiple sessions. + * To decompress LZ4 blocks using dictionary, attached or not, + * just employ the regular LZ4_setStreamDecode() for streaming, + * or the stateless LZ4_decompress_safe_usingDict() for one-shot decompression. + */ +LZ4LIB_API void +LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for better compression ratio. + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). + * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. + * Each block has precise boundaries. + * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. 
+ * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. + * Make sure that buffers are separated, by at least one byte. + * This construction ensures that each block only depends on previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), + * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. + */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. + * Use this function to start decompression of a new stream of blocks. + * A dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. + * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! 
LZ4_decompress_safe_continue() : + * This decoding function allows decompression of consecutive blocks in "streaming" mode. + * The difference with the usual independent blocks is that + * new blocks are allowed to find references into former blocks. + * A block is an unsplittable entity, and must be presented entirely to the decompression function. + * LZ4_decompress_safe_continue() only accepts one block at a time. + * It's modeled after `LZ4_decompress_safe()` and behaves similarly. + * + * @LZ4_streamDecode : decompression state, tracking the position in memory of past data + * @compressedSize : exact complete size of one compressed block. + * @dstCapacity : size of destination buffer (which must be already allocated), + * must be an upper bound of decompressed size. + * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. + * + * The last 64KB of previously decoded data *must* remain available and unmodified + * at the memory position where they were previously decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int +LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, char* dst, + int srcSize, int dstCapacity); + + +/*! LZ4_decompress_safe_usingDict() : + * Works the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue() + * However, it's stateless: it doesn't need any LZ4_streamDecode_t state. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int +LZ4_decompress_safe_usingDict(const char* src, char* dst, + int srcSize, int dstCapacity, + const char* dictStart, int dictSize); + +/*! 
LZ4_decompress_safe_partial_usingDict() : + * Behaves the same as LZ4_decompress_safe_partial() + * with the added ability to specify a memory segment for past data. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int +LZ4_decompress_safe_partial_usingDict(const char* src, char* dst, + int compressedSize, + int targetOutputSize, int maxOutputSize, + const char* dictStart, int dictSize); + +#endif /* LZ4_H_2983827168210 */ + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +# define LZ4LIB_STATIC_API LZ4LIB_API +#else +# define LZ4LIB_STATIC_API +#endif + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_compress_destSize_extState() : introduced in v1.10.0 + * Same as LZ4_compress_destSize(), but using an externally allocated state. + * Also: exposes @acceleration + */ +int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration); + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly constrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. + * Buffer size must feature some margin, hence be larger than final size. + * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. 
+ * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's not compressed. + * This can happen when data is not compressible (already compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with both + * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, + * and data expansion, which can happen when input is not compressible. + * As a consequence, buffer size requirements are much higher, + * and memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply this limit. + * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, + * so it's a reasonable trick when inputs are known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, + * in which case, the return code will be 0 (zero). + * The caller must be ready for these cases to happen, + * and typically design a backup scheme to send data uncompressed. + * The combination of both techniques can significantly reduce + * the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. + * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, + * so it's possible to reduce memory requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + + + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose user code to API and/or ABI break in future versions of the library. 
+ **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef int8_t LZ4_i8; + typedef unsigned char LZ4_byte; + typedef uint16_t LZ4_u16; + typedef uint32_t LZ4_u32; +#else + typedef signed char LZ4_i8; + typedef unsigned char LZ4_byte; + typedef unsigned short LZ4_u16; + typedef unsigned int LZ4_u32; +#endif + +/*! LZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_stream_t object. +**/ + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ +}; + +#define LZ4_STREAM_MINSIZE ((1UL << (LZ4_MEMORY_USAGE)) + 32) /* static size, for inter-version compatibility */ +union LZ4_stream_u { + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead +**/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* stateBuffer, size_t size); + + +/*! LZ4_streamDecode_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_streamDecode_t object. +**/ +typedef struct { + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#define LZ4_STREAMDECODE_MINSIZE 32 +union LZ4_streamDecode_u { + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. 
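+ *
+ * For example (one possible arrangement in client code; shown here only
+ * as an illustration of the note above):
+ *     #define LZ4_DISABLE_DEPRECATE_WARNINGS
+ *     #include "lz4.h"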
+ */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +# define LZ4_DEPRECATED(message) /* disabled */ +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/*! Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/*! 
Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating towards LZ4_decompress_safe_continue() instead. " + "Note that the contract will change (requires block's compressed size, instead of decompressed size)") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. 
+ */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + + +#endif /* LZ4_H_98237428734687 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/tracegrind/tg_lz4.c b/tracegrind/tg_lz4.c new file mode 100644 index 000000000..7fa24f0e1 --- /dev/null +++ b/tracegrind/tg_lz4.c @@ -0,0 +1,95 @@ +/* + * LZ4 compression wrapper for Tracegrind. + * Uses vendored LZ4 library adapted for Valgrind (no libc). + * + * BSD 2-Clause License - see lz4.c for full license. + */ + +#include "pub_tool_basics.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_mallocfree.h" + +#include "tg_lz4.h" + +/*------------------------------------------------------------*/ +/*--- LZ4 Configuration for Valgrind ---*/ +/*------------------------------------------------------------*/ + +/* Disable memory allocation functions (we provide them below) */ +#define LZ4_USER_MEMORY_FUNCTIONS 1 + +/* Freestanding mode - no string.h */ +#define LZ4_FREESTANDING 1 + +/* Provide size_t */ +#ifndef size_t +#define size_t SizeT +#endif + +/* Provide INT_MAX from limits.h */ +#ifndef INT_MAX +#define INT_MAX 2147483647 +#endif + +#ifndef UINT_MAX +#define UINT_MAX 4294967295U +#endif + +/*------------------------------------------------------------*/ +/*--- Memory function replacements ---*/ +/*------------------------------------------------------------*/ + +/* Define LZ4_memcpy, LZ4_memmove, LZ4_memset before including lz4 */ +#define LZ4_memcpy(dst, src, size) VG_(memcpy)((dst), (src), (size)) +#define LZ4_memmove(dst, src, size) VG_(memmove)((dst), (src), (size)) +#define LZ4_memset(p, v, s) VG_(memset)((p), (v), (s)) + +/*------------------------------------------------------------*/ +/*--- Memory allocation functions (LZ4_USER_MEMORY_FUNCTIONS) */ +/*------------------------------------------------------------*/ + +void* LZ4_malloc(size_t s) +{ + return VG_(malloc)("tg.lz4", s); +} + +void* LZ4_calloc(size_t n, size_t s) +{ + return VG_(calloc)("tg.lz4", n, s); +} + +void LZ4_free(void* p) +{ + if (p) VG_(free)(p); +} + +/*------------------------------------------------------------*/ +/*--- Include the original LZ4 implementation ---*/ +/*------------------------------------------------------------*/ + +/* Disable assert (LZ4 has its own fallback) */ +#define LZ4_DEBUG 0 + +/* Include the main LZ4 source */ +#include "lz4.c" + +/*------------------------------------------------------------*/ +/*--- Wrapper API ---*/ +/*------------------------------------------------------------*/ + +SizeT tg_lz4_compress_bound(SizeT src_size) +{ + return LZ4_compressBound((int)src_size); +} + +SizeT tg_lz4_compress(void* dst, SizeT dst_capacity, + const void* src, SizeT src_size) +{ + int result = LZ4_compress_default((const char*)src, (char*)dst, + (int)src_size, (int)dst_capacity); + if (result <= 0) { + return 0; + } + return (SizeT)result; +} diff --git a/tracegrind/tg_lz4.h b/tracegrind/tg_lz4.h new file mode 100644 index 000000000..63a427501 --- /dev/null +++ b/tracegrind/tg_lz4.h @@ -0,0 +1,21 @@ +/* + * LZ4 compression wrapper for Tracegrind. + * Uses vendored LZ4 library adapted for Valgrind (no libc). + */ + +#ifndef TG_LZ4_H +#define TG_LZ4_H + +#include "pub_tool_basics.h" + +/* Return the maximum compressed size for a given source length */ +SizeT tg_lz4_compress_bound(SizeT src_size); + +/* Compress src[0..src_size-1] into dst. + * dst_capacity must be >= tg_lz4_compress_bound(src_size). + * Returns the compressed size on success, 0 on error. 
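+ *
+ * A minimal usage sketch (illustrative only; the allocation tag
+ * "tg.example" is a placeholder, not a name used elsewhere in the tool):
+ *
+ *   SizeT bound = tg_lz4_compress_bound(src_size);
+ *   void* buf   = VG_(malloc)("tg.example", bound);
+ *   SizeT csize = tg_lz4_compress(buf, bound, src, src_size);
+ *   if (csize == 0)
+ *      ... fall back to storing the data uncompressed ...
+ *   VG_(free)(buf);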
+ */ +SizeT tg_lz4_compress(void* dst, SizeT dst_capacity, + const void* src, SizeT src_size); + +#endif /* TG_LZ4_H */ diff --git a/tracegrind/tg_msgpack.c b/tracegrind/tg_msgpack.c new file mode 100644 index 000000000..da8911307 --- /dev/null +++ b/tracegrind/tg_msgpack.c @@ -0,0 +1,214 @@ +/* + * Minimal MsgPack encoder for Tracegrind. + * Write-only, adapted for Valgrind (no libc). + * + * MsgPack format spec: https://github.com/msgpack/msgpack/blob/master/spec.md + */ + +#include "pub_tool_basics.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_mallocfree.h" + +#include "tg_msgpack.h" + +/* Ensure at least `needed` bytes of capacity */ +static void msgpack_ensure(msgpack_buffer* mb, Int needed) +{ + if (mb->size + needed <= mb->capacity) + return; + Int new_cap = mb->capacity * 2; + if (new_cap < mb->size + needed) + new_cap = mb->size + needed; + mb->data = VG_(realloc)("tg.msgpack.buf", mb->data, new_cap); + mb->capacity = new_cap; +} + +static void write_byte(msgpack_buffer* mb, UChar b) +{ + msgpack_ensure(mb, 1); + mb->data[mb->size++] = b; +} + +static void write_bytes(msgpack_buffer* mb, const void* data, Int len) +{ + msgpack_ensure(mb, len); + VG_(memcpy)(mb->data + mb->size, data, len); + mb->size += len; +} + +/* Write big-endian integers */ +static void write_be16(msgpack_buffer* mb, UShort val) +{ + UChar buf[2]; + buf[0] = (UChar)(val >> 8); + buf[1] = (UChar)(val); + write_bytes(mb, buf, 2); +} + +static void write_be32(msgpack_buffer* mb, UInt val) +{ + UChar buf[4]; + buf[0] = (UChar)(val >> 24); + buf[1] = (UChar)(val >> 16); + buf[2] = (UChar)(val >> 8); + buf[3] = (UChar)(val); + write_bytes(mb, buf, 4); +} + +static void write_be64(msgpack_buffer* mb, ULong val) +{ + UChar buf[8]; + buf[0] = (UChar)(val >> 56); + buf[1] = (UChar)(val >> 48); + buf[2] = (UChar)(val >> 40); + buf[3] = (UChar)(val >> 32); + buf[4] = (UChar)(val >> 24); + buf[5] = (UChar)(val >> 16); + buf[6] = (UChar)(val >> 8); + buf[7] = (UChar)(val); + write_bytes(mb, buf, 8); +} + +void msgpack_init(msgpack_buffer* mb, Int capacity) +{ + if (capacity < 256) capacity = 256; + mb->data = VG_(malloc)("tg.msgpack.init", capacity); + mb->size = 0; + mb->capacity = capacity; +} + +void msgpack_free(msgpack_buffer* mb) +{ + if (mb->data) { + VG_(free)(mb->data); + mb->data = NULL; + } + mb->size = 0; + mb->capacity = 0; +} + +void msgpack_reset(msgpack_buffer* mb) +{ + mb->size = 0; +} + +void msgpack_write_nil(msgpack_buffer* mb) +{ + write_byte(mb, 0xc0); +} + +void msgpack_write_bool(msgpack_buffer* mb, Bool val) +{ + write_byte(mb, val ? 
0xc3 : 0xc2); +} + +void msgpack_write_int(msgpack_buffer* mb, Long val) +{ + if (val >= 0) { + msgpack_write_uint(mb, (ULong)val); + } else if (val >= -32) { + /* negative fixint: 111xxxxx */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -128) { + write_byte(mb, 0xd0); /* int8 */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -32768) { + write_byte(mb, 0xd1); /* int16 */ + write_be16(mb, (UShort)(val & 0xffff)); + } else if (val >= -2147483648LL) { + write_byte(mb, 0xd2); /* int32 */ + write_be32(mb, (UInt)(val & 0xffffffff)); + } else { + write_byte(mb, 0xd3); /* int64 */ + write_be64(mb, (ULong)val); + } +} + +void msgpack_write_uint(msgpack_buffer* mb, ULong val) +{ + if (val <= 0x7f) { + /* positive fixint: 0xxxxxxx */ + write_byte(mb, (UChar)val); + } else if (val <= 0xff) { + write_byte(mb, 0xcc); /* uint8 */ + write_byte(mb, (UChar)val); + } else if (val <= 0xffff) { + write_byte(mb, 0xcd); /* uint16 */ + write_be16(mb, (UShort)val); + } else if (val <= 0xffffffff) { + write_byte(mb, 0xce); /* uint32 */ + write_be32(mb, (UInt)val); + } else { + write_byte(mb, 0xcf); /* uint64 */ + write_be64(mb, val); + } +} + +void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len) +{ + if (len < 0) len = VG_(strlen)(str); + + if (len <= 31) { + /* fixstr: 101xxxxx */ + write_byte(mb, (UChar)(0xa0 | len)); + } else if (len <= 0xff) { + write_byte(mb, 0xd9); /* str8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xda); /* str16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xdb); /* str32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, str, len); +} + +void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len) +{ + if (len <= 0xff) { + write_byte(mb, 0xc4); /* bin8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xc5); /* bin16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xc6); /* bin32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, data, len); +} + +void msgpack_write_array_header(msgpack_buffer* mb, UInt count) +{ + if (count <= 15) { + /* fixarray: 1001xxxx */ + write_byte(mb, (UChar)(0x90 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xdc); /* array16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdd); /* array32 */ + write_be32(mb, count); + } +} + +void msgpack_write_map_header(msgpack_buffer* mb, UInt count) +{ + if (count <= 15) { + /* fixmap: 1000xxxx */ + write_byte(mb, (UChar)(0x80 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xde); /* map16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdf); /* map32 */ + write_be32(mb, count); + } +} + +void msgpack_write_key(msgpack_buffer* mb, const HChar* key) +{ + msgpack_write_str(mb, key, -1); +} diff --git a/tracegrind/tg_msgpack.h b/tracegrind/tg_msgpack.h new file mode 100644 index 000000000..e04d317bc --- /dev/null +++ b/tracegrind/tg_msgpack.h @@ -0,0 +1,36 @@ +/* + * Minimal MsgPack encoder for Tracegrind. + * Write-only, adapted for Valgrind (no libc). 
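+ *
+ * A minimal encoding sketch (illustrative only; the key/value pairs are
+ * placeholders, not part of the trace format):
+ *
+ *   msgpack_buffer mb;
+ *   msgpack_init(&mb, 1024);
+ *   msgpack_write_map_header(&mb, 2);
+ *   msgpack_write_key(&mb, "version");
+ *   msgpack_write_uint(&mb, 2);
+ *   msgpack_write_key(&mb, "format");
+ *   msgpack_write_str(&mb, "tracegrind-msgpack", -1);
+ *   // mb.data now holds mb.size bytes of encoded MsgPack
+ *   msgpack_free(&mb);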
+ */ + +#ifndef TG_MSGPACK_H +#define TG_MSGPACK_H + +#include "pub_tool_basics.h" + +typedef struct { + UChar* data; + Int size; + Int capacity; +} msgpack_buffer; + +void msgpack_init(msgpack_buffer* mb, Int capacity); +void msgpack_free(msgpack_buffer* mb); +void msgpack_reset(msgpack_buffer* mb); + +/* Encode primitives */ +void msgpack_write_nil(msgpack_buffer* mb); +void msgpack_write_bool(msgpack_buffer* mb, Bool val); +void msgpack_write_int(msgpack_buffer* mb, Long val); +void msgpack_write_uint(msgpack_buffer* mb, ULong val); +void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len); +void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len); + +/* Containers */ +void msgpack_write_array_header(msgpack_buffer* mb, UInt count); +void msgpack_write_map_header(msgpack_buffer* mb, UInt count); + +/* Convenience: write a string key (for maps) */ +void msgpack_write_key(msgpack_buffer* mb, const HChar* key); + +#endif /* TG_MSGPACK_H */ From 864a751da10825560802dee2817a24f2c4679335 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 00:20:15 +0000 Subject: [PATCH 04/26] feat: discriminated union schema for msgpack, drop CSV format - Update msgpack format to version 2 with event_schemas - Each event type (ENTER, EXIT, FORK) has its own column schema - FORK events use minimal 4-element format: [seq, tid, event, child_pid] - Remove CSV output format entirely (msgpack-only now) - Add decode-trace.py script for debugging trace files - Add fork detection via post-syscall handler for fork/clone/vfork Co-Authored-By: Claude Opus 4.5 --- tracegrind/clo.c | 5 +- tracegrind/docs/tracegrind-msgpack-format.md | 59 +++- tracegrind/dump.c | 245 ++++++---------- tracegrind/global.h | 19 +- tracegrind/main.c | 63 ++++- tracegrind/scripts/decode-trace.py | 277 +++++++++++++++++++ 6 files changed, 480 insertions(+), 188 deletions(-) create mode 100755 tracegrind/scripts/decode-trace.py diff --git a/tracegrind/clo.c b/tracegrind/clo.c index 50e4800e4..5657d7d10 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -518,8 +518,6 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) else if VG_STR_CLO(arg, "--tracegrind-out-file", TG_(clo).out_format) {} - else if VG_XACT_CLO(arg, "--output-format=csv", - TG_(clo).output_format, output_format_csv) {} else if VG_XACT_CLO(arg, "--output-format=msgpack", TG_(clo).output_format, output_format_msgpack) {} @@ -578,7 +576,6 @@ void TG_(print_usage)(void) VG_(printf)( "\n dump creation options:\n" " --tracegrind-out-file= Output file name [tracegrind.out.%%p]\n" -" --output-format=csv|msgpack Output format [csv]\n" " --dump-line=no|yes Dump source lines of costs? [yes]\n" " --dump-instr=no|yes Dump instruction address of costs? [no]\n" " --compress-strings=no|yes Compress strings in profile dump? [yes]\n" @@ -704,5 +701,5 @@ void TG_(set_clo_defaults)(void) TG_(clo).verbose_start = 0; #endif - TG_(clo).output_format = output_format_csv; + TG_(clo).output_format = output_format_msgpack; } diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index 12b4c4658..b2f6d6e94 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -2,7 +2,7 @@ ## Overview -Tracegrind's `--output-format=msgpack` produces a binary trace file combining MsgPack serialization with LZ4 block compression. Files use the `.msgpack.lz4` extension. +Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 block compression. 
Files use the `.msgpack.lz4` extension. ## File Structure @@ -23,7 +23,7 @@ Tracegrind's `--output-format=msgpack` produces a binary trace file combining Ms | Offset | Size | Field | Description | |--------|------|---------|-------------| | 0 | 4 | magic | ASCII `TGMP` (0x54 0x47 0x4D 0x50) | -| 4 | 4 | version | Format version, uint32 LE (currently 1) | +| 4 | 4 | version | Format version, uint32 LE (currently 2) | ## Chunk Format @@ -37,17 +37,31 @@ Each chunk (schema and data) has the same header: ## Schema Chunk -The first chunk contains a MsgPack map: +The first chunk contains a MsgPack map describing the discriminated union schema: ```json { - "version": 1, + "version": 2, "format": "tracegrind-msgpack", - "columns": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...] + "event_schemas": { + "0": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "2": ["seq", "tid", "event", "child_pid"] + } } ``` -### Fixed Columns +### Event Types + +| Type | Name | Description | +|------|-------|-------------| +| 0 | ENTER | Function entry | +| 1 | EXIT | Function exit | +| 2 | FORK | Child process created | + +### Row Schemas + +**ENTER/EXIT rows (event 0, 1):** | Index | Name | Type | Description | |-------|-------|--------|-------------| @@ -58,17 +72,31 @@ The first chunk contains a MsgPack map: | 4 | obj | string | Shared object path | | 5 | file | string | Source file path | | 6 | line | int32 | Line number (0 if unknown) | +| 7+ | ... | int64 | Event counter deltas (Ir, Dr, Dw, etc.) | + +**FORK rows (event 2):** + +| Index | Name | Type | Description | +|-------|-----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID that called fork | +| 2 | event | int | 2 = FORK | +| 3 | child_pid | int32 | PID of the new child process | + +### Event Counter Columns -### Event Columns (index 7+) +For ENTER/EXIT rows, event counters appear as delta values starting at index 7. Which counters are present depends on Tracegrind options: -Event counters as delta values: `Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim`. Which columns are present depends on Tracegrind options. +`Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim` ## Data Chunks -Each data chunk contains concatenated MsgPack arrays (one per row): +Each data chunk contains concatenated MsgPack arrays. The row format depends on the event type (index 2): ``` -[seq, tid, event, fn, obj, file, line, delta_Ir, ...] +[seq, tid, 0, fn, obj, file, line, delta_Ir, ...] # ENTER +[seq, tid, 1, fn, obj, file, line, delta_Ir, ...] # EXIT +[seq, tid, 2, child_pid] # FORK ``` The reference implementation writes 4096 rows per chunk. @@ -86,13 +114,16 @@ def read_tracegrind(filepath): with open(filepath, 'rb') as f: assert f.read(4) == b'TGMP' version = struct.unpack('= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + /* Close msgpack output */ static void msgpack_close_output(void) { @@ -272,52 +309,6 @@ static void msgpack_close_output(void) } } -/* Write a string to the trace output fd */ -static void trace_write(const HChar* buf, Int len) -{ - if (TG_(trace_out).fd < 0) return; - VG_(write)(TG_(trace_out).fd, buf, len); -} - -/* Escape a string for CSV: if it contains comma, quote, or newline, - * wrap in quotes and double any quotes. Otherwise just copy. - * Writes to buf, returns chars written. buf must be large enough. 
- */ -static Int csv_escape(HChar* buf, Int bufsize, const HChar* src) -{ - Bool needs_quote = False; - const HChar* p; - Int i; - - for (p = src; *p; p++) { - if (*p == ',' || *p == '"' || *p == '\n') { - needs_quote = True; - break; - } - } - - if (!needs_quote) { - i = 0; - for (p = src; *p && i < bufsize - 1; p++, i++) - buf[i] = *p; - buf[i] = '\0'; - return i; - } - - i = 0; - if (i < bufsize - 1) buf[i++] = '"'; - for (p = src; *p && i < bufsize - 2; p++) { - if (*p == '"' && i < bufsize - 3) { - buf[i++] = '"'; - buf[i++] = '"'; - } else { - buf[i++] = *p; - } - } - if (i < bufsize - 1) buf[i++] = '"'; - buf[i] = '\0'; - return i; -} void TG_(trace_open_output)(void) { @@ -335,13 +326,11 @@ void TG_(trace_open_output)(void) filename[sizeof(filename) - 1] = '\0'; VG_(free)(expanded); - /* Append format-specific suffix */ - if (TG_(clo).output_format == output_format_msgpack) { - SizeT len = VG_(strlen)(filename); - if (len + 12 < sizeof(filename)) { - VG_(strncpy)(filename + len, ".msgpack.lz4", sizeof(filename) - len - 1); - filename[sizeof(filename) - 1] = '\0'; - } + /* Append .msgpack.lz4 suffix */ + SizeT len = VG_(strlen)(filename); + if (len + 12 < sizeof(filename)) { + VG_(strncpy)(filename + len, ".msgpack.lz4", sizeof(filename) - len - 1); + filename[sizeof(filename) - 1] = '\0'; } res = VG_(open)(filename, @@ -358,42 +347,33 @@ void TG_(trace_open_output)(void) TG_(trace_out).initialized = True; TG_(trace_out).header_written = False; - /* Initialize format-specific writer */ - if (TG_(clo).output_format == output_format_msgpack) { - msgpack_init_state(); - } + /* Initialize msgpack writer */ + msgpack_init_state(); if (VG_(clo_verbosity) > 1) VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); } -/* Write the CSV header row. - * Called lazily on first sample emission so that event sets are fully configured. +/* + * Called in child process after fork. + * Closes the inherited file descriptor (without writing end marker) + * and opens a new trace file with the child's PID. */ -static void trace_write_header(void) +void TG_(trace_reopen_child)(void) { - HChar buf[4096]; - Int pos = 0; - - if (TG_(trace_out).header_written) return; - TG_(trace_out).header_written = True; - - pos += VG_(sprintf)(buf + pos, "seq,tid,event,fn,obj,file,line"); - - /* Emit column names for all events in the full event set */ - EventSet* es = TG_(sets).full; - Int g, i; - for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { - if (!(es->mask & (1u << g))) continue; - EventGroup* eg = TG_(get_event_group)(g); - if (!eg) continue; - for (i = 0; i < eg->size; i++) { - pos += VG_(sprintf)(buf + pos, ",%s", eg->name[i]); - } + /* Close inherited fd without flushing/finalizing (that's parent's job) */ + if (TG_(trace_out).fd >= 0) { + VG_(close)(TG_(trace_out).fd); } - pos += VG_(sprintf)(buf + pos, "\n"); - trace_write(buf, pos); + /* Reset state completely */ + TG_(trace_out).fd = -1; + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = False; + TG_(trace_out).header_written = False; + + /* Open new trace file with child's PID (also re-inits msgpack state) */ + TG_(trace_open_output)(); } void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, @@ -441,53 +421,27 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, } /* Event type: 0=ENTER, 1=EXIT */ - Int event_val = is_enter ? 0 : 1; - const HChar* event_str = is_enter ? 
"ENTER" : "EXIT"; - - if (TG_(clo).output_format == output_format_msgpack) { - /* --- MsgPack + LZ4 path --- */ - msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, - fn_name, obj_name, file_name, (Int)line, - deltas, es->size); - } else { - /* --- CSV path --- */ - HChar buf[4096]; - HChar escaped[1024]; - Int pos = 0; - - /* Lazily write header on first sample */ - if (!TG_(trace_out).header_written) - trace_write_header(); + Int event_val = is_enter ? TG_EV_ENTER : TG_EV_EXIT; - /* seq, tid, event */ - pos += VG_(sprintf)(buf + pos, "%llu,%u,%s,", - TG_(trace_out).seq, - (UInt)tid, - event_str); - - /* fn (escaped) */ - csv_escape(escaped, sizeof(escaped), fn_name); - pos += VG_(sprintf)(buf + pos, "%s,", escaped); - - /* obj (escaped) */ - csv_escape(escaped, sizeof(escaped), obj_name); - pos += VG_(sprintf)(buf + pos, "%s,", escaped); - - /* file (escaped) */ - csv_escape(escaped, sizeof(escaped), file_name); - pos += VG_(sprintf)(buf + pos, "%s,", escaped); + msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, + fn_name, obj_name, file_name, (Int)line, + deltas, es->size); +} - /* line */ - pos += VG_(sprintf)(buf + pos, "%u", line); +/* + * Emit a FORK event when a child process is created. + * Called from the post-syscall handler when fork/clone returns in parent. + * child_pid is the PID of the newly created child process. + */ +void TG_(trace_emit_fork)(ThreadId tid, Int child_pid) +{ + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; - /* event deltas */ - for (i = 0; i < es->size; i++) { - pos += VG_(sprintf)(buf + pos, ",%llu", deltas[i]); - } + TG_(trace_out).seq++; - pos += VG_(sprintf)(buf + pos, "\n"); - trace_write(buf, pos); - } + /* FORK uses minimal schema: [seq, tid, event, child_pid] */ + msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); } void TG_(trace_close_output)(void) @@ -495,28 +449,9 @@ void TG_(trace_close_output)(void) if (!TG_(trace_out).initialized) return; if (TG_(trace_out).fd < 0) return; - if (TG_(clo).output_format == output_format_msgpack) { - /* MsgPack close flushes remaining rows, writes end marker, closes fd */ - msgpack_close_output(); - VG_(close)(TG_(trace_out).fd); - } else { - /* Write a totals summary comment at the end for verification */ - if (TG_(total_cost)) { - HChar buf[4096]; - Int pos = 0; - Int i; - EventSet* es = TG_(sets).full; - - pos += VG_(sprintf)(buf + pos, "# totals:"); - for (i = 0; i < es->size; i++) { - pos += VG_(sprintf)(buf + pos, " %llu", TG_(total_cost)[i]); - } - pos += VG_(sprintf)(buf + pos, "\n"); - trace_write(buf, pos); - } - - VG_(close)(TG_(trace_out).fd); - } + /* Flush remaining rows, write end marker */ + msgpack_close_output(); + VG_(close)(TG_(trace_out).fd); TG_(trace_out).fd = -1; TG_(trace_out).initialized = False; diff --git a/tracegrind/global.h b/tracegrind/global.h index dd9afd561..a9c8649d2 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -30,6 +30,7 @@ #include "pub_tool_basics.h" #include "pub_tool_vki.h" +#include "pub_tool_vkiscnums.h" #include "pub_tool_debuginfo.h" #include "pub_tool_libcbase.h" #include "pub_tool_libcassert.h" @@ -79,10 +80,16 @@ typedef enum { } Collect_Systime; typedef enum { - output_format_csv, - output_format_msgpack + output_format_msgpack = 0 } OutputFormat; +/* Trace event types */ +typedef enum { + TG_EV_ENTER = 0, + TG_EV_EXIT = 1, + TG_EV_FORK = 2 +} TraceEventType; + typedef struct _CommandLineOptions CommandLineOptions; struct _CommandLineOptions { @@ -687,14 +694,14 @@ struct event_sets { 
/*------------------------------------------------------------*/ -/*--- CSV trace output state ---*/ +/*--- Trace output state ---*/ /*------------------------------------------------------------*/ typedef struct { Int fd; /* Output file descriptor (-1 if not open) */ ULong seq; /* Global sequence counter */ Bool initialized; /* Has the output been opened? */ - Bool header_written; /* Has the CSV header been written? */ + Bool header_written; /* Has the schema chunk been written? */ } trace_output; @@ -805,9 +812,11 @@ void TG_(run_post_signal_on_call_stack_bottom)(void); /* from dump.c */ void TG_(init_dumps)(void); -/* CSV trace output (from dump.c) */ +/* Trace output (from dump.c) */ void TG_(trace_open_output)(void); +void TG_(trace_reopen_child)(void); void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); +void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); void TG_(trace_close_output)(void); /*------------------------------------------------------------*/ diff --git a/tracegrind/main.c b/tracegrind/main.c index 33c204b54..8b5e62ecb 100644 --- a/tracegrind/main.c +++ b/tracegrind/main.c @@ -1742,11 +1742,14 @@ void collect_time (struct vki_timespec *systime, struct vki_timespec *syscputime } static -void TG_(pre_syscalltime)(ThreadId tid, UInt syscallno, - UWord* args, UInt nArgs) +void TG_(pre_syscall)(ThreadId tid, UInt syscallno, + UWord* args, UInt nArgs) { - collect_time(&syscalltime[tid], - TG_(clo).collect_systime == systime_nsec ? &syscallcputime[tid] : NULL); + /* Collect time for systime tracking if enabled */ + if (TG_(clo).collect_systime != systime_no) { + collect_time(&syscalltime[tid], + TG_(clo).collect_systime == systime_nsec ? &syscallcputime[tid] : NULL); + } } /* Returns "after - before" in the unit as specified by --collect-systime. @@ -1772,11 +1775,35 @@ ULong vki_timespec_diff (struct vki_timespec after, struct vki_timespec before) return ((ULong) diff_sec * 1000000000ULL + diff_nsec) / nsec_factor; } +/* Check if syscall is a fork-like call that creates a new process */ +static Bool is_fork_syscall(UInt syscallno) +{ +#if defined(VGO_linux) + return syscallno == __NR_clone + || syscallno == __NR_fork + || syscallno == __NR_vfork +# if defined(__NR_clone3) + || syscallno == __NR_clone3 +# endif + ; +#else + return False; /* TODO: support other OSes */ +#endif +} + static -void TG_(post_syscalltime)(ThreadId tid, UInt syscallno, - UWord* args, UInt nArgs, SysRes res) +void TG_(post_syscall)(ThreadId tid, UInt syscallno, + UWord* args, UInt nArgs, SysRes res) { - if (TG_(current_state).bbcc) { + /* Handle fork/clone: emit FORK event with child PID */ + if (is_fork_syscall(syscallno) && !sr_isError(res) && sr_Res(res) > 0) { + /* We're in the parent, sr_Res(res) is the child PID */ + Int child_pid = (Int)sr_Res(res); + TG_(trace_emit_fork)(tid, child_pid); + } + + /* Handle systime collection if enabled */ + if (TG_(clo).collect_systime != systime_no && TG_(current_state).bbcc) { Int o; struct vki_timespec ts_now; struct vki_timespec ts_cpunow; @@ -2020,6 +2047,15 @@ static void tg_start_client_code_callback ( ThreadId tid, ULong blocks_done ) TG_(run_thread)( tid ); } +/* + * Called after fork() in the child process. + * Reopens the trace file with the child's PID. + */ +static void tg_atfork_child(ThreadId tid) +{ + TG_(trace_reopen_child)(); +} + static void TG_(post_clo_init)(void) { @@ -2033,9 +2069,11 @@ void TG_(post_clo_init)(void) "sp-at-mem-access\n"); } + /* Always register syscall wrappers for fork/clone detection. 
+ Also handles systime collection if enabled. */ + VG_(needs_syscall_wrapper)(TG_(pre_syscall), TG_(post_syscall)); + if (TG_(clo).collect_systime != systime_no) { - VG_(needs_syscall_wrapper)(TG_(pre_syscalltime), - TG_(post_syscalltime)); syscalltime = TG_MALLOC("cl.main.pci.1", VG_N_THREADS * sizeof syscalltime[0]); for (UInt i = 0; i < VG_N_THREADS; ++i) { @@ -2101,12 +2139,15 @@ void TG_(post_clo_init)(void) TG_(instrument_state) = TG_(clo).instrument_atstart; - /* Open CSV trace output file */ + /* Open trace output file */ TG_(trace_open_output)(); + /* Register fork handler to emit FORK events */ + VG_(atfork)(NULL, NULL, tg_atfork_child); + if (VG_(clo_verbosity) > 0) { VG_(message)(Vg_UserMsg, - "Streaming CSV trace output to tracegrind.out.%d\n", + "Streaming trace output to tracegrind.out.%d\n", VG_(getpid)()); } } diff --git a/tracegrind/scripts/decode-trace.py b/tracegrind/scripts/decode-trace.py new file mode 100755 index 000000000..2b3fb4c31 --- /dev/null +++ b/tracegrind/scripts/decode-trace.py @@ -0,0 +1,277 @@ +#!/usr/bin/env -S uvx --with lz4 --with msgpack python3 +# /// script +# requires-python = ">=3.8" +# dependencies = ["lz4", "msgpack"] +# /// +""" +Decode and debug tracegrind MsgPack+LZ4 trace files. + +Usage: + ./decode-trace.py [options] + +Examples: + ./decode-trace.py tracegrind.out.12345.msgpack.lz4 + ./decode-trace.py trace.msgpack.lz4 --head 20 + ./decode-trace.py trace.msgpack.lz4 --schema + ./decode-trace.py trace.msgpack.lz4 --stats + ./decode-trace.py trace.msgpack.lz4 --json +""" + +import argparse +import json +import struct +import sys +from collections import Counter +from typing import Any, BinaryIO, Dict, Iterator, List, Tuple + +import lz4.block +import msgpack + + +MAGIC = b'TGMP' + + +def read_header(f: BinaryIO) -> int: + """Read and validate file header, return version.""" + magic = f.read(4) + if magic != MAGIC: + raise ValueError(f"Invalid magic: {magic!r}, expected {MAGIC!r}") + version = struct.unpack(' bytes | None: + """Read a single chunk, return decompressed data or None for end marker.""" + header = f.read(8) + if len(header) < 8: + return None + usize, csize = struct.unpack(' Dict[str, Any]: + """Decode schema chunk into Python dict.""" + schema = msgpack.unpackb(data, raw=False) + return schema + + +def iter_rows(data: bytes) -> Iterator[List[Any]]: + """Iterate over rows in a data chunk.""" + unpacker = msgpack.Unpacker(raw=False) + unpacker.feed(data) + yield from unpacker + + +def decode_trace(filepath: str) -> Tuple[int, Dict[str, Any], List[List[Any]]]: + """Decode entire trace file, return (version, schema, rows).""" + with open(filepath, 'rb') as f: + version = read_header(f) + + # Read schema chunk + schema_data = read_chunk(f) + if schema_data is None: + raise ValueError("Missing schema chunk") + schema = decode_schema(schema_data) + + # Read all data chunks + rows = [] + while True: + chunk_data = read_chunk(f) + if chunk_data is None: + break + rows.extend(iter_rows(chunk_data)) + + return version, schema, rows + + +def get_event_name(event_type: int) -> str: + """Convert event type to name.""" + return {0: 'ENTER', 1: 'EXIT', 2: 'FORK'}.get(event_type, f'UNKNOWN({event_type})') + + +def format_row(row: List[Any], schema: Dict[str, Any]) -> Dict[str, Any]: + """Format a row as a dict using the appropriate schema.""" + if len(row) < 3: + return {'_raw': row} + + event_type = row[2] + event_schemas = schema.get('event_schemas', {}) + columns = event_schemas.get(str(event_type), []) + + if not columns: + # Fallback for 
old format with 'columns' key + columns = schema.get('columns', []) + + result = {} + for i, val in enumerate(row): + if i < len(columns): + key = columns[i] + if key == 'event': + result[key] = get_event_name(val) + else: + result[key] = val + else: + result[f'_col{i}'] = val + + return result + + +def print_schema(schema: Dict[str, Any], version: int) -> None: + """Print schema information.""" + print(f"Format Version: {version}") + print(f"Format Name: {schema.get('format', 'unknown')}") + print(f"Schema Version: {schema.get('version', 'unknown')}") + print() + + if 'event_schemas' in schema: + print("Event Schemas (discriminated union):") + for event_type, columns in sorted(schema['event_schemas'].items()): + event_name = get_event_name(int(event_type)) + print(f" {event_type} ({event_name}): {columns}") + elif 'columns' in schema: + print(f"Columns: {schema['columns']}") + print() + + +def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: + """Print statistics about the trace.""" + print(f"Total rows: {len(rows):,}") + + if not rows: + return + + # Count by event type + event_counts = Counter(row[2] for row in rows if len(row) > 2) + print("\nEvents by type:") + for event_type, count in sorted(event_counts.items()): + event_name = get_event_name(event_type) + pct = 100 * count / len(rows) + print(f" {event_name}: {count:,} ({pct:.1f}%)") + + # Thread stats + thread_ids = set(row[1] for row in rows if len(row) > 1) + print(f"\nThreads: {len(thread_ids)} ({sorted(thread_ids)})") + + # Sequence range + seqs = [row[0] for row in rows if len(row) > 0] + if seqs: + print(f"Sequence range: {min(seqs):,} - {max(seqs):,}") + + # Function stats (for ENTER/EXIT events) + fn_counts = Counter() + for row in rows: + if len(row) > 3 and row[2] in (0, 1): # ENTER or EXIT + fn_counts[row[3]] += 1 + + if fn_counts: + print(f"\nTop 10 functions by event count:") + for fn, count in fn_counts.most_common(10): + print(f" {count:8,} {fn}") + + # FORK events + fork_rows = [row for row in rows if len(row) > 2 and row[2] == 2] + if fork_rows: + print(f"\nFork events: {len(fork_rows)}") + for row in fork_rows[:5]: + formatted = format_row(row, schema) + child_pid = formatted.get('child_pid', 'unknown') + print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_pid={child_pid}") + + +def print_rows(rows: List[List[Any]], schema: Dict[str, Any], + head: int | None = None, raw: bool = False, as_json: bool = False) -> None: + """Print rows in various formats.""" + display_rows = rows[:head] if head else rows + + if as_json: + output = [format_row(row, schema) for row in display_rows] + print(json.dumps(output, indent=2)) + return + + for row in display_rows: + if raw: + print(row) + else: + formatted = format_row(row, schema) + # Compact single-line format + parts = [] + for k, v in formatted.items(): + if isinstance(v, str) and len(v) > 50: + v = v[:47] + '...' 
+ parts.append(f"{k}={v}") + print(' | '.join(parts)) + + +def main(): + parser = argparse.ArgumentParser( + description='Decode and debug tracegrind MsgPack+LZ4 trace files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument('file', help='Trace file to decode (.msgpack.lz4)') + parser.add_argument('--schema', action='store_true', + help='Print schema information only') + parser.add_argument('--stats', action='store_true', + help='Print statistics about the trace') + parser.add_argument('--head', type=int, metavar='N', + help='Print only first N rows') + parser.add_argument('--tail', type=int, metavar='N', + help='Print only last N rows') + parser.add_argument('--raw', action='store_true', + help='Print raw row arrays') + parser.add_argument('--json', action='store_true', + help='Output as JSON') + parser.add_argument('--event', type=str, choices=['ENTER', 'EXIT', 'FORK'], + help='Filter by event type') + parser.add_argument('--fn', type=str, metavar='PATTERN', + help='Filter by function name (substring match)') + + args = parser.parse_args() + + try: + version, schema, rows = decode_trace(args.file) + except Exception as e: + print(f"Error reading trace file: {e}", file=sys.stderr) + sys.exit(1) + + # Schema only mode + if args.schema: + print_schema(schema, version) + sys.exit(0) + + # Apply filters + filtered_rows = rows + + if args.event: + event_map = {'ENTER': 0, 'EXIT': 1, 'FORK': 2} + event_type = event_map[args.event] + filtered_rows = [r for r in filtered_rows if len(r) > 2 and r[2] == event_type] + + if args.fn: + pattern = args.fn.lower() + filtered_rows = [r for r in filtered_rows + if len(r) > 3 and isinstance(r[3], str) and pattern in r[3].lower()] + + # Stats mode + if args.stats: + print_schema(schema, version) + print_stats(filtered_rows, schema) + sys.exit(0) + + # Default: print rows + if args.tail: + filtered_rows = filtered_rows[-args.tail:] + + print_schema(schema, version) + print(f"Showing {min(args.head or len(filtered_rows), len(filtered_rows)):,} of {len(filtered_rows):,} rows") + print("-" * 80) + print_rows(filtered_rows, schema, head=args.head, raw=args.raw, as_json=args.json) + + +if __name__ == '__main__': + main() From b4da8df620ee294760cea1d94db17957935ef8c2 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 00:23:18 +0000 Subject: [PATCH 05/26] feat(bench): add tracegrind benchmarks for comparison with callgrind Add tracegrind configurations to the benchmark suite: - tracegrind/default: basic tracing - tracegrind/cache-sim: with cache simulation - tracegrind/cache-sim+systime: with cache sim and syscall timing This allows direct performance comparison between callgrind and tracegrind. Co-Authored-By: Claude Opus 4.5 --- bench/bench.py | 63 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/bench/bench.py b/bench/bench.py index 18e2c472a..f803e0015 100755 --- a/bench/bench.py +++ b/bench/bench.py @@ -41,16 +41,17 @@ def __init__( raise RuntimeError(f"Valgrind not found at: {self.valgrind_path}") self.valgrind_version = result.stdout.strip() - def run_valgrind(self, *args: str) -> None: - """Execute valgrind with given arguments. + def run_valgrind(self, tool: str, *args: str) -> None: + """Execute valgrind with given tool and arguments. 
Args: + tool: Valgrind tool to use (callgrind, tracegrind) *args: Valgrind arguments """ cmd = [ self.valgrind_path, - "--tool=callgrind", + f"--tool={tool}", "--log-file=/dev/null", *args, *shlex.split(self.cmd), @@ -77,16 +78,19 @@ def runner(request): def pytest_generate_tests(metafunc): """Parametrize tests with valgrind configurations.""" - if "valgrind_args" in metafunc.fixturenames: + if "tool_and_args" in metafunc.fixturenames: runner = getattr(metafunc.config, "_valgrind_runner", None) if not runner: return - # Define valgrind configurations + # Define configurations for each tool + # Format: (tool, args, config_name) configs = [ - (["--read-inline-info=no"], "no-inline"), - (["--read-inline-info=yes"], "inline"), + # Callgrind configurations + ("callgrind", ["--read-inline-info=no"], "callgrind/no-inline"), + ("callgrind", ["--read-inline-info=yes"], "callgrind/inline"), ( + "callgrind", [ "--trace-children=yes", "--cache-sim=yes", @@ -99,9 +103,10 @@ def pytest_generate_tests(metafunc): "--dump-line=no", "--read-inline-info=yes", ], - "full-with-inline", + "callgrind/full-with-inline", ), ( + "callgrind", [ "--trace-children=yes", "--cache-sim=yes", @@ -113,38 +118,62 @@ def pytest_generate_tests(metafunc): "--combine-dumps=yes", "--dump-line=no", ], - "full-no-inline", + "callgrind/full-no-inline", + ), + # Tracegrind configurations + ("tracegrind", [], "tracegrind/default"), + ( + "tracegrind", + [ + "--cache-sim=yes", + "--I1=32768,8,64", + "--D1=32768,8,64", + "--LL=8388608,16,64", + ], + "tracegrind/cache-sim", + ), + ( + "tracegrind", + [ + "--cache-sim=yes", + "--I1=32768,8,64", + "--D1=32768,8,64", + "--LL=8388608,16,64", + "--collect-systime=nsec", + ], + "tracegrind/cache-sim+systime", ), ] # If the valgrind version is from CodSpeed, we don't want to display the exact version - # to allow comparison against older versions. + # to allow comparison against older versions. if ".codspeed" in runner.valgrind_version: runner.valgrind_version = "valgrind.codspeed" # Create test IDs with format: valgrind-version, command, config-name test_ids = [ f"{runner.valgrind_version}, {runner.cmd}, {config_name}" - for _, config_name in configs + for _, _, config_name in configs ] - # Parametrize with just the args + # Parametrize with (tool, args) tuples metafunc.parametrize( - "valgrind_args", - [args for args, _ in configs], + "tool_and_args", + [(tool, args) for tool, args, _ in configs], ids=test_ids, ) @pytest.mark.benchmark -def test_valgrind(runner, valgrind_args): +def test_valgrind(runner, tool_and_args): if runner: - runner.run_valgrind(*valgrind_args) + tool, args = tool_and_args + runner.run_valgrind(tool, *args) def main(): parser = argparse.ArgumentParser( - description="Benchmark Valgrind with pytest-codspeed", + description="Benchmark Valgrind tools (callgrind, tracegrind) with pytest-codspeed", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: From 38e24d9da51f5efbd5a87ea0f66cc913742cc416 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 00:30:58 +0000 Subject: [PATCH 06/26] fix(bench): skip tracegrind tests when tool not available Detect available tools at startup and only run benchmarks for tools that are present. This fixes CI failures when running against upstream valgrind which doesn't have tracegrind. 
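As a rough illustration of the detection logic (a standalone Python sketch mirroring what
bench.py now does; the helper name and the example path are illustrative only): a tool is
considered available when `valgrind --tool=<name> --help` exits with status 0.

```python
import subprocess

def has_tool(valgrind_path: str, tool: str) -> bool:
    # A tool counts as available when `--tool=<name> --help` succeeds.
    result = subprocess.run(
        [valgrind_path, f"--tool={tool}", "--help"],
        capture_output=True,
    )
    return result.returncode == 0

# e.g. has_tool("/usr/bin/valgrind", "tracegrind") -> False on upstream valgrind
```
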
Co-Authored-By: Claude Opus 4.5 --- bench/bench.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/bench/bench.py b/bench/bench.py index f803e0015..06ff6b66f 100755 --- a/bench/bench.py +++ b/bench/bench.py @@ -41,6 +41,22 @@ def __init__( raise RuntimeError(f"Valgrind not found at: {self.valgrind_path}") self.valgrind_version = result.stdout.strip() + # Check which tools are available + self.available_tools = self._detect_available_tools() + + def _detect_available_tools(self) -> set: + """Detect which valgrind tools are available.""" + tools = set() + for tool in ["callgrind", "tracegrind"]: + result = subprocess.run( + [self.valgrind_path, f"--tool={tool}", "--help"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + tools.add(tool) + return tools + def run_valgrind(self, tool: str, *args: str) -> None: """Execute valgrind with given tool and arguments. @@ -85,7 +101,7 @@ def pytest_generate_tests(metafunc): # Define configurations for each tool # Format: (tool, args, config_name) - configs = [ + all_configs = [ # Callgrind configurations ("callgrind", ["--read-inline-info=no"], "callgrind/no-inline"), ("callgrind", ["--read-inline-info=yes"], "callgrind/inline"), @@ -120,7 +136,7 @@ def pytest_generate_tests(metafunc): ], "callgrind/full-no-inline", ), - # Tracegrind configurations + # Tracegrind configurations (only available in codspeed fork) ("tracegrind", [], "tracegrind/default"), ( "tracegrind", @@ -145,6 +161,16 @@ def pytest_generate_tests(metafunc): ), ] + # Filter configs to only include available tools + configs = [ + (tool, args, name) + for tool, args, name in all_configs + if tool in runner.available_tools + ] + + if not configs: + return + # If the valgrind version is from CodSpeed, we don't want to display the exact version # to allow comparison against older versions. if ".codspeed" in runner.valgrind_version: @@ -208,6 +234,7 @@ def main(): valgrind_path=args.valgrind_path, ) print(f"Valgrind version: {runner.valgrind_version}") + print(f"Available tools: {', '.join(sorted(runner.available_tools))}") print(f"Command: {args.cmd}") # Plugin to pass runner to tests From 1a04cd74eca206d9f99d57ca0483be00be513a56 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 23:02:16 +0000 Subject: [PATCH 07/26] feat: add MARKER event type and remove legacy dump infrastructure Add TRACEGRIND_ADD_MARKER client request that emits named marker events (event=0) into the trace stream, renumbering ENTER=1, EXIT=2, FORK=3. Remove the legacy dump_profile/zero_all_cost/dump_every_bb machinery inherited from callgrind, replacing it with the simpler compute_total_cost. Update the analyzer script (renamed from decode-trace.py) to match the new event numbering. 
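For illustration, decoded rows under the new numbering look roughly like this (values are
made up; the column layout follows the updated tracegrind-msgpack-format.md below):

```python
MARKER, ENTER, EXIT, FORK = 0, 1, 2, 3

example_rows = [
    [10, 1, MARKER, "request-start"],                         # [seq, tid, event, marker]
    [11, 1, ENTER, "main", "/usr/bin/app", "app.c", 12, 42],  # counter deltas follow `line`
    [12, 1, EXIT,  "main", "/usr/bin/app", "app.c", 12, 7],
    [13, 1, FORK,  4242],                                     # [seq, tid, event, child_pid]
]
```
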
Co-Authored-By: Claude Opus 4.6 --- tracegrind/callstack.c | 14 -- tracegrind/clo.c | 38 ---- tracegrind/docs/tracegrind-msgpack-format.md | 40 +++-- tracegrind/dump.c | 162 +++++------------- tracegrind/fn.c | 3 - tracegrind/global.h | 19 +- tracegrind/main.c | 125 ++++---------- ...decode-trace.py => tracegrind-analyzer.py} | 10 +- tracegrind/threads.c | 13 -- tracegrind/tracegrind.h | 48 +++--- 10 files changed, 139 insertions(+), 333 deletions(-) rename tracegrind/scripts/{decode-trace.py => tracegrind-analyzer.py} (96%) diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index e1e8d84bf..8d262a88a 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -127,15 +127,6 @@ static void function_entered(fn_node* fn) } #endif - if (fn->dump_before) { - HChar trigger[VG_(strlen)(fn->name) + 20]; - VG_(sprintf)(trigger, "--dump-before=%s", fn->name); - TG_(dump_profile)(trigger, True); - } - else if (fn->zero_before) { - TG_(zero_all_cost)(True); - } - if (fn->toggle_collect) { TG_(current_state).collect = !TG_(current_state).collect; TG_DEBUG(2," entering %s: toggled collection state to %s\n", @@ -149,11 +140,6 @@ static void function_left(fn_node* fn) { TG_ASSERT(fn != 0); - if (fn->dump_after) { - HChar trigger[VG_(strlen)(fn->name) + 20]; - VG_(sprintf)(trigger, "--dump-after=%s", fn->name); - TG_(dump_profile)(trigger, True); - } if (fn->toggle_collect) { TG_(current_state).collect = !TG_(current_state).collect; TG_DEBUG(2," leaving %s: toggled collection state to %s\n", diff --git a/tracegrind/clo.c b/tracegrind/clo.c index 5657d7d10..6c7ac6805 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -42,9 +42,6 @@ /* Logging configuration for a function */ struct _fn_config { - Int dump_before; - Int dump_after; - Int zero_before; Int toggle_collect; Int skip; /* Handle CALL to this function as JMP (= Skip)? 
*/ @@ -99,9 +96,6 @@ fn_config* new_fnc(void) fn_config* fnc = (fn_config*) TG_MALLOC("cl.clo.nf.1", sizeof(fn_config)); - fnc->dump_before = CONFIG_DEFAULT; - fnc->dump_after = CONFIG_DEFAULT; - fnc->zero_before = CONFIG_DEFAULT; fnc->toggle_collect = CONFIG_DEFAULT; fnc->skip = CONFIG_DEFAULT; fnc->pop_on_jump = CONFIG_DEFAULT; @@ -315,15 +309,6 @@ static fn_config* get_fnc(const HChar* name) static void update_fn_config1(fn_node* fn, fn_config* fnc) { - if (fnc->dump_before != CONFIG_DEFAULT) - fn->dump_before = (fnc->dump_before == CONFIG_TRUE); - - if (fnc->dump_after != CONFIG_DEFAULT) - fn->dump_after = (fnc->dump_after == CONFIG_TRUE); - - if (fnc->zero_before != CONFIG_DEFAULT) - fn->zero_before = (fnc->zero_before == CONFIG_TRUE); - if (fnc->toggle_collect != CONFIG_DEFAULT) fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE); @@ -439,21 +424,6 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) TG_(clo).objs_to_skip[TG_(clo).objs_to_skip_count-1] = obj_name; } - else if VG_STR_CLO(arg, "--dump-before", tmp_str) { - fn_config* fnc = get_fnc(tmp_str); - fnc->dump_before = CONFIG_TRUE; - } - - else if VG_STR_CLO(arg, "--zero-before", tmp_str) { - fn_config* fnc = get_fnc(tmp_str); - fnc->zero_before = CONFIG_TRUE; - } - - else if VG_STR_CLO(arg, "--dump-after", tmp_str) { - fn_config* fnc = get_fnc(tmp_str); - fnc->dump_after = CONFIG_TRUE; - } - else if VG_STR_CLO(arg, "--toggle-collect", tmp_str) { fn_config* fnc = get_fnc(tmp_str); fnc->toggle_collect = CONFIG_TRUE; @@ -531,8 +501,6 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) else if VG_BOOL_CLO(arg, "--dump-instr", TG_(clo).dump_instr) {} else if VG_BOOL_CLO(arg, "--dump-bb", TG_(clo).dump_bb) {} - else if VG_INT_CLO( arg, "--dump-every-bb", TG_(clo).dump_every_bb) {} - else if VG_BOOL_CLO(arg, "--collect-alloc", TG_(clo).collect_alloc) {} else if VG_XACT_CLO(arg, "--collect-systime=no", TG_(clo).collect_systime, systime_no) {} @@ -590,10 +558,6 @@ void TG_(print_usage)(void) #endif "\n activity options (for interactivity use tracegrind_control):\n" -" --dump-every-bb= Dump every basic blocks [0=never]\n" -" --dump-before= Dump when entering function\n" -" --zero-before= Zero all costs when entering function\n" -" --dump-after= Dump when leaving function\n" #if TG_EXPERIMENTAL " --dump-objs=no|yes Dump static object information [no]\n" #endif @@ -671,8 +635,6 @@ void TG_(set_clo_defaults)(void) TG_(clo).dump_bb = False; TG_(clo).dump_bbs = False; - TG_(clo).dump_every_bb = 0; - /* Collection */ TG_(clo).separate_threads = False; TG_(clo).collect_atstart = True; diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index b2f6d6e94..fbf7b71dc 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -44,43 +44,54 @@ The first chunk contains a MsgPack map describing the discriminated union schema "version": 2, "format": "tracegrind-msgpack", "event_schemas": { - "0": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "0": ["seq", "tid", "event", "marker"], "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], - "2": ["seq", "tid", "event", "child_pid"] + "2": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "3": ["seq", "tid", "event", "child_pid"] } } ``` ### Event Types -| Type | Name | Description | -|------|-------|-------------| -| 0 | ENTER | Function entry | -| 1 | EXIT | Function exit | -| 2 | FORK | Child process created | +| Type | Name | 
Description | +|------|--------|-------------| +| 0 | MARKER | Named marker | +| 1 | ENTER | Function entry | +| 2 | EXIT | Function exit | +| 3 | FORK | Child process created | ### Row Schemas -**ENTER/EXIT rows (event 0, 1):** +**MARKER rows (event 0):** + +| Index | Name | Type | Description | +|-------|--------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 0 = MARKER | +| 3 | marker | string | Marker label | + +**ENTER/EXIT rows (event 1, 2):** | Index | Name | Type | Description | |-------|-------|--------|-------------| | 0 | seq | uint64 | Sequence number | | 1 | tid | int32 | Thread ID | -| 2 | event | int | 0 = ENTER, 1 = EXIT | +| 2 | event | int | 1 = ENTER, 2 = EXIT | | 3 | fn | string | Function name | | 4 | obj | string | Shared object path | | 5 | file | string | Source file path | | 6 | line | int32 | Line number (0 if unknown) | | 7+ | ... | int64 | Event counter deltas (Ir, Dr, Dw, etc.) | -**FORK rows (event 2):** +**FORK rows (event 3):** | Index | Name | Type | Description | |-------|-----------|--------|-------------| | 0 | seq | uint64 | Sequence number | | 1 | tid | int32 | Thread ID that called fork | -| 2 | event | int | 2 = FORK | +| 2 | event | int | 3 = FORK | | 3 | child_pid | int32 | PID of the new child process | ### Event Counter Columns @@ -94,9 +105,10 @@ For ENTER/EXIT rows, event counters appear as delta values starting at index 7. Each data chunk contains concatenated MsgPack arrays. The row format depends on the event type (index 2): ``` -[seq, tid, 0, fn, obj, file, line, delta_Ir, ...] # ENTER -[seq, tid, 1, fn, obj, file, line, delta_Ir, ...] # EXIT -[seq, tid, 2, child_pid] # FORK +[seq, tid, 0, marker] # MARKER +[seq, tid, 1, fn, obj, file, line, delta_Ir, ...] # ENTER +[seq, tid, 2, fn, obj, file, line, delta_Ir, ...] # EXIT +[seq, tid, 3, child_pid] # FORK ``` The reference implementation writes 4096 rows per chunk. diff --git a/tracegrind/dump.c b/tracegrind/dump.c index 871b31aec..a89b54a87 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -32,24 +32,11 @@ #include "pub_tool_threadstate.h" #include "pub_tool_libcfile.h" -/* ================================================================== */ -/* === Legacy dump state (kept for totals verification) === */ -/* ================================================================== */ - -static Int out_counter = 0; -static HChar* out_file = 0; -static Bool dumps_initialized = False; - -/* Total reads/writes/misses sum over all dumps and threads. */ +/* Total reads/writes/misses sum over all threads. 
*/ FullCost TG_(total_cost) = 0; EventMapping* TG_(dumpmap) = 0; -Int TG_(get_dump_counter)(void) -{ - return out_counter; -} - /* ================================================================== */ /* === Trace output === */ /* ================================================================== */ @@ -137,24 +124,32 @@ static void msgpack_write_header(void) /* event_schemas - discriminated union: each event type has its own schema */ msgpack_write_key(&hdr, "event_schemas"); - msgpack_write_map_header(&hdr, 3); /* 3 event types: ENTER, EXIT, FORK */ + msgpack_write_map_header(&hdr, 4); /* 4 event types: MARKER, ENTER, EXIT, FORK */ - /* Event type 0 (ENTER) schema */ + /* Event type 0 (MARKER) schema */ msgpack_write_key(&hdr, "0"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "marker", -1); + + /* Event type 1 (ENTER) schema */ + msgpack_write_key(&hdr, "1"); msgpack_write_array_header(&hdr, mp_state.ncols); for (Int i = 0; i < mp_state.ncols; i++) { msgpack_write_str(&hdr, mp_state.col_names[i], -1); } - /* Event type 1 (EXIT) schema - same as ENTER */ - msgpack_write_key(&hdr, "1"); + /* Event type 2 (EXIT) schema - same as ENTER */ + msgpack_write_key(&hdr, "2"); msgpack_write_array_header(&hdr, mp_state.ncols); for (Int i = 0; i < mp_state.ncols; i++) { msgpack_write_str(&hdr, mp_state.col_names[i], -1); } - /* Event type 2 (FORK) schema - minimal: seq, tid, event, child_pid */ - msgpack_write_key(&hdr, "2"); + /* Event type 3 (FORK) schema */ + msgpack_write_key(&hdr, "3"); msgpack_write_array_header(&hdr, 4); msgpack_write_str(&hdr, "seq", -1); msgpack_write_str(&hdr, "tid", -1); @@ -291,6 +286,22 @@ static void msgpack_add_fork_row(ULong seq, Int tid, Int child_pid) } } +/* Add a MARKER row to the msgpack output (seq, tid, event, marker_str) */ +static void msgpack_add_marker_row(ULong seq, Int tid, const HChar* marker) +{ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_MARKER); + msgpack_write_str(&mp_state.buf, marker, -1); + + mp_state.rows_in_chunk++; + + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + /* Close msgpack output */ static void msgpack_close_output(void) { @@ -444,6 +455,16 @@ void TG_(trace_emit_fork)(ThreadId tid, Int child_pid) msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); } +void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker) +{ + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; + + TG_(trace_out).seq++; + + msgpack_add_marker_row(TG_(trace_out).seq, (Int)tid, marker); +} + void TG_(trace_close_output)(void) { if (!TG_(trace_out).initialized) return; @@ -463,78 +484,14 @@ void TG_(trace_close_output)(void) } -/* ================================================================== */ -/* === Simplified dump (totals only, for verification) === */ -/* ================================================================== */ - -/* Command buffer for dump header */ -static HChar *cmdbuf; - -static void init_cmdbuf(void) -{ - SizeT size; - Int i,j; - - size = 1; - size += VG_(strlen)( VG_(args_the_exename) ); - for (i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) { - const HChar *arg = *(HChar**)VG_(indexXA)( VG_(args_for_client), i ); - size += 1; - for(j=0; arg[j]; j++) - switch(arg[j]) { - case '\n': - case '\\': - 
size++; - /* fallthrough */ - default: - size++; - } - } - - cmdbuf = TG_MALLOC("tg.dump.ic.1", size + 1); - - size = VG_(sprintf)(cmdbuf, " %s", VG_(args_the_exename)); - - for(i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) { - const HChar *arg = * (HChar**) VG_(indexXA)( VG_(args_for_client), i ); - cmdbuf[size++] = ' '; - for(j=0; arg[j]; j++) - switch(arg[j]) { - case '\n': - cmdbuf[size++] = '\\'; - cmdbuf[size++] = 'n'; - break; - case '\\': - cmdbuf[size++] = '\\'; - cmdbuf[size++] = '\\'; - break; - default: - cmdbuf[size++] = arg[j]; - break; - } - } - cmdbuf[size] = '\0'; -} - - -/* Dump profile now only computes totals (no callgraph output). - * The real output is the streaming CSV trace. - */ -void TG_(dump_profile)(const HChar* trigger, Bool only_current_thread) +/* Sum costs from all threads into total_cost */ +void TG_(compute_total_cost)(void) { - TG_DEBUG(2, "+ dump_profile(Trigger '%s')\n", - trigger ? trigger : "Prg.Term."); - - TG_(init_dumps)(); - out_counter++; - - /* Compute totals from all threads */ if (!TG_(total_cost)) { TG_(total_cost) = TG_(get_eventset_cost)(TG_(sets).full); TG_(init_cost)(TG_(sets).full, TG_(total_cost)); } - /* Sum costs from all threads into total_cost */ { Int t; thread_info** thr = TG_(get_threads)(); @@ -543,41 +500,8 @@ void TG_(dump_profile)(const HChar* trigger, Bool only_current_thread) TG_(add_diff_cost)(TG_(sets).full, TG_(total_cost), thr[t]->lastdump_cost, thr[t]->states.entry[0]->cost); - /* Update lastdump_cost */ TG_(copy_cost)(TG_(sets).full, thr[t]->lastdump_cost, thr[t]->states.entry[0]->cost); } } - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Dump done (trigger: %s).\n", - trigger ? trigger : "Prg.Term."); -} - - -void TG_(init_dumps)(void) -{ - static int thisPID = 0; - int currentPID = VG_(getpid)(); - if (currentPID == thisPID) { - TG_ASSERT(out_file != 0); - return; - } - thisPID = currentPID; - - if (!TG_(clo).out_format) - TG_(clo).out_format = DEFAULT_OUTFORMAT; - - if (out_file) { - VG_(free)(out_file); - out_counter = 0; - } - - out_file = - VG_(expand_file_name)("--tracegrind-out-file", TG_(clo).out_format); - - if (!dumps_initialized) - init_cmdbuf(); - - dumps_initialized = True; } diff --git a/tracegrind/fn.c b/tracegrind/fn.c index 36ab8d394..f9802c10c 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -447,9 +447,6 @@ fn_node* new_fn_node(const HChar *fnname, fn->file = file; fn->next = next; - fn->dump_before = False; - fn->dump_after = False; - fn->zero_before = False; fn->toggle_collect = False; fn->skip = False; fn->obj_skip_checked = False; diff --git a/tracegrind/global.h b/tracegrind/global.h index a9c8649d2..8b0101b09 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -85,9 +85,10 @@ typedef enum { /* Trace event types */ typedef enum { - TG_EV_ENTER = 0, - TG_EV_EXIT = 1, - TG_EV_FORK = 2 + TG_EV_MARKER = 0, + TG_EV_ENTER = 1, + TG_EV_EXIT = 2, + TG_EV_FORK = 3 } TraceEventType; typedef struct _CommandLineOptions CommandLineOptions; @@ -106,9 +107,6 @@ struct _CommandLineOptions { Bool dump_bb; Bool dump_bbs; /* Dump basic block information? */ - /* Dump generation options */ - ULong dump_every_bb; /* Dump every xxx BBs. */ - /* Collection options */ Bool separate_threads; /* Separate threads in dump? */ Int separate_callers; /* Separate dependent on how many callers? 
*/ @@ -419,9 +417,6 @@ struct _fn_node { file_node* file; /* reverse mapping for 2nd hash */ fn_node* next; - Bool dump_before :1; - Bool dump_after :1; - Bool zero_before :1; Bool toggle_collect :1; Bool skip :1; Bool obj_skip_checked : 1; @@ -726,9 +721,7 @@ Bool TG_(get_debug_info)(Addr, const HChar **dirname, const HChar **fn_name, UInt*, DebugInfo**); void TG_(collectBlockInfo)(IRSB* bbIn, UInt*, UInt*, Bool*); void TG_(set_instrument_state)(const HChar*,Bool); -void TG_(dump_profile)(const HChar* trigger,Bool only_current_thread); -void TG_(zero_all_cost)(Bool only_current_thread); -Int TG_(get_dump_counter)(void); +void TG_(compute_total_cost)(void); void TG_(fini)(Int exitcode); /* from bb.c */ @@ -810,13 +803,13 @@ void TG_(post_signal)(ThreadId tid, Int sigNum); void TG_(run_post_signal_on_call_stack_bottom)(void); /* from dump.c */ -void TG_(init_dumps)(void); /* Trace output (from dump.c) */ void TG_(trace_open_output)(void); void TG_(trace_reopen_child)(void); void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); +void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker); void TG_(trace_close_output)(void); /*------------------------------------------------------------*/ diff --git a/tracegrind/main.c b/tracegrind/main.c index 8b5e62ecb..1d9a6fc1f 100644 --- a/tracegrind/main.c +++ b/tracegrind/main.c @@ -704,8 +704,8 @@ void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode, ea, mkIRExpr_HWord( datasize ) ); regparms = 3; di = unsafeIRDirty_0_N( - regparms, - helperName, VG_(fnptr_to_fnentry)( helperAddr ), + regparms, + helperName, VG_(fnptr_to_fnentry)( helperAddr ), argv ); di->guard = guard; addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); @@ -908,7 +908,7 @@ void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) IRConst_U32( addr ) : IRConst_U64( addr )), IRExpr_Const(IRConst_U32(val)) )); -} +} /* add helper call to setup_bbcc, with pointer to BB struct as argument @@ -993,7 +993,7 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, TG_ASSERT(Ist_IMark == st->tag); origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta; - TG_ASSERT(origAddr == st->Ist.IMark.addr + TG_ASSERT(origAddr == st->Ist.IMark.addr + st->Ist.IMark.delta); // XXX: check no overflow /* Get BB struct (creating if necessary). 
@@ -1389,41 +1389,6 @@ void tg_discard_superblock_info ( Addr orig_addr, VexGuestExtents vge ) -static void zero_thread_cost(thread_info* t) -{ - Int i; - - for(i = 0; i < TG_(current_call_stack).sp; i++) { - if (!TG_(current_call_stack).entry[i].jcc) continue; - - /* reset call counters to current for active calls */ - TG_(copy_cost)( TG_(sets).full, - TG_(current_call_stack).entry[i].enter_cost, - TG_(current_state).cost ); - TG_(current_call_stack).entry[i].jcc->call_counter = 0; - } - - TG_(forall_bbccs)(TG_(zero_bbcc)); - - /* set counter for last dump */ - TG_(copy_cost)( TG_(sets).full, - t->lastdump_cost, TG_(current_state).cost ); -} - -void TG_(zero_all_cost)(Bool only_current_thread) -{ - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, " Zeroing costs...\n"); - - if (only_current_thread) - zero_thread_cost(TG_(get_current_thread)()); - else - TG_(forall_threads)(zero_thread_cost); - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, " ...done\n"); -} - static void unwind_thread(thread_info* t) { @@ -1494,18 +1459,18 @@ static void dump_state_of_thread_togdb(thread_info* ti) ce = TG_(get_call_entry)(i); /* if this frame is skipped, we don't have counters */ if (!ce->jcc) continue; - + from = ce->jcc->from; VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name); VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter); - + /* FIXME: EventSets! */ TG_(copy_cost)( TG_(sets).full, sum, ce->jcc->cost ); TG_(copy_cost)( TG_(sets).full, tmp, ce->enter_cost ); TG_(add_diff_cost)( TG_(sets).full, sum, ce->enter_cost, TG_(current_state).cost ); TG_(copy_cost)( TG_(sets).full, ce->enter_cost, tmp ); - + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); VG_(gdb_printf)("events-%d-%d: %s\n",t, i, mcost); VG_(free)(mcost); @@ -1538,9 +1503,8 @@ static void dump_state_togdb(void) HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); VG_(gdb_printf)("events: %s\n", evmap); VG_(free)(evmap); - /* "part:" line (number of last part. 
Is 0 at start */ - VG_(gdb_printf)("part: %d\n", TG_(get_dump_counter)()); - + /* Total cost summary */ + /* threads */ th = TG_(get_threads)(); VG_(gdb_printf)("threads:"); @@ -1553,15 +1517,11 @@ static void dump_state_togdb(void) TG_(forall_threads)(dump_state_of_thread_togdb); } - + static void print_monitor_help ( void ) { VG_(gdb_printf) ("\n"); VG_(gdb_printf) ("tracegrind monitor commands:\n"); - VG_(gdb_printf) (" dump []\n"); - VG_(gdb_printf) (" dump counters\n"); - VG_(gdb_printf) (" zero\n"); - VG_(gdb_printf) (" zero counters\n"); VG_(gdb_printf) (" status\n"); VG_(gdb_printf) (" print status\n"); VG_(gdb_printf) (" instrumentation [on|off]\n"); @@ -1579,7 +1539,7 @@ static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req) VG_(strcpy) (s, req); wcmd = VG_(strtok_r) (s, " ", &ssaveptr); - switch (VG_(keyword_id) ("help dump zero status instrumentation", + switch (VG_(keyword_id) ("help status instrumentation", wcmd, kwd_report_duplicated_matches)) { case -2: /* multiple matches */ return True; @@ -1588,16 +1548,8 @@ static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req) case 0: /* help */ print_monitor_help(); return True; - case 1: { /* dump */ - TG_(dump_profile)(req, False); - return True; - } - case 2: { /* zero */ - TG_(zero_all_cost)(False); - return True; - } - case 3: { /* status */ + case 1: { /* status */ HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr); if (arg && (VG_(strcmp)(arg, "internal") == 0)) { /* internal interface to tracegrind_control */ @@ -1618,7 +1570,7 @@ static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req) return True; } - case 4: { /* instrumentation */ + case 2: { /* instrumentation */ HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr); if (!arg) { VG_(gdb_printf)("instrumentation: %s\n", @@ -1629,7 +1581,7 @@ static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req) return True; } - default: + default: tl_assert(0); return False; } @@ -1643,26 +1595,6 @@ Bool TG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret) return False; switch(args[0]) { - case VG_USERREQ__DUMP_STATS: - TG_(dump_profile)("Client Request", True); - *ret = 0; /* meaningless */ - break; - - case VG_USERREQ__DUMP_STATS_AT: - { - const HChar *arg = (HChar*)args[1]; - HChar buf[30 + VG_(strlen)(arg)]; // large enough - VG_(sprintf)(buf,"Client Request: %s", arg); - TG_(dump_profile)(buf, True); - *ret = 0; /* meaningless */ - } - break; - - case VG_USERREQ__ZERO_STATS: - TG_(zero_all_cost)(True); - *ret = 0; /* meaningless */ - break; - case VG_USERREQ__TOGGLE_COLLECT: TG_(current_state).collect = !TG_(current_state).collect; TG_DEBUG(2, "Client Request: toggled collection state to %s\n", @@ -1670,6 +1602,15 @@ Bool TG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret) *ret = 0; /* meaningless */ break; + case VG_USERREQ__ADD_MARKER: + { + const HChar *marker = (HChar*)args[1]; + TG_DEBUG(2, "Client Request: add marker '%s'\n", marker); + TG_(trace_emit_marker)(tid, marker); + *ret = 0; /* meaningless */ + } + break; + case VG_USERREQ__START_INSTRUMENTATION: TG_(set_instrument_state)("Client Request", True); *ret = 0; /* meaningless */ @@ -1688,6 +1629,12 @@ Bool TG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret) *ret = 0; return handled; } + case VG_USERREQ__DUMP_STATS: + case VG_USERREQ__ZERO_STATS: + TG_DEBUG(2, "Client Request: ignoring %llx\n", (ULong)args[0]); + *ret = 0; /* meaningless */ + break; + default: VG_(message)(Vg_UserMsg, "Warning: unknown tracegrind client request 
code %llx\n", @@ -1972,13 +1919,13 @@ void finish(void) */ TG_(forall_threads)(unwind_thread); - TG_(dump_profile)(0, False); + TG_(compute_total_cost)(); /* Close CSV trace output */ TG_(trace_close_output)(); if (VG_(clo_verbosity) == 0) return; - + if (VG_(clo_stats)) { VG_(message)(Vg_DebugMsg, "\n"); tg_print_stats(); @@ -2064,7 +2011,7 @@ void TG_(post_clo_init)(void) TG_DEBUG(1, " Using user specified value for " "--vex-iropt-register-updates\n"); } else { - TG_DEBUG(1, + TG_DEBUG(1, " Using default --vex-iropt-register-updates=" "sp-at-mem-access\n"); } @@ -2094,13 +2041,13 @@ void TG_(post_clo_init)(void) TG_DEBUG(1, " Using user specified value for " "--px-file-backed\n"); } else { - TG_DEBUG(1, + TG_DEBUG(1, " Using default --px-file-backed=" "sp-at-mem-access\n"); } if (VG_(clo_vex_control).iropt_unroll_thresh != 0) { - VG_(message)(Vg_UserMsg, + VG_(message)(Vg_UserMsg, "tracegrind only works with --vex-iropt-unroll-thresh=0\n" "=> resetting it back to 0\n"); VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. @@ -2111,7 +2058,7 @@ void TG_(post_clo_init)(void) "=> resetting it back to 'no'\n"); VG_(clo_vex_control).guest_chase = False; // cannot be overridden. } - + TG_DEBUG(1, " dump threads: %s\n", TG_(clo).separate_threads ? "Yes":"No"); TG_DEBUG(1, " call sep. : %d\n", TG_(clo).separate_callers); TG_DEBUG(1, " rec. sep. : %d\n", TG_(clo).separate_recursions); @@ -2121,8 +2068,6 @@ void TG_(post_clo_init)(void) TG_(clo).dump_line = True; } - TG_(init_dumps)(); - (*TG_(cachesim).post_clo_init)(); TG_(init_eventsets)(); diff --git a/tracegrind/scripts/decode-trace.py b/tracegrind/scripts/tracegrind-analyzer.py similarity index 96% rename from tracegrind/scripts/decode-trace.py rename to tracegrind/scripts/tracegrind-analyzer.py index 2b3fb4c31..b703c7fa9 100755 --- a/tracegrind/scripts/decode-trace.py +++ b/tracegrind/scripts/tracegrind-analyzer.py @@ -91,7 +91,7 @@ def decode_trace(filepath: str) -> Tuple[int, Dict[str, Any], List[List[Any]]]: def get_event_name(event_type: int) -> str: """Convert event type to name.""" - return {0: 'ENTER', 1: 'EXIT', 2: 'FORK'}.get(event_type, f'UNKNOWN({event_type})') + return {0: 'MARKER', 1: 'ENTER', 2: 'EXIT', 3: 'FORK'}.get(event_type, f'UNKNOWN({event_type})') def format_row(row: List[Any], schema: Dict[str, Any]) -> Dict[str, Any]: @@ -165,7 +165,7 @@ def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: # Function stats (for ENTER/EXIT events) fn_counts = Counter() for row in rows: - if len(row) > 3 and row[2] in (0, 1): # ENTER or EXIT + if len(row) > 3 and row[2] in (1, 2): # ENTER or EXIT fn_counts[row[3]] += 1 if fn_counts: @@ -174,7 +174,7 @@ def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: print(f" {count:8,} {fn}") # FORK events - fork_rows = [row for row in rows if len(row) > 2 and row[2] == 2] + fork_rows = [row for row in rows if len(row) > 2 and row[2] == 3] if fork_rows: print(f"\nFork events: {len(fork_rows)}") for row in fork_rows[:5]: @@ -226,7 +226,7 @@ def main(): help='Print raw row arrays') parser.add_argument('--json', action='store_true', help='Output as JSON') - parser.add_argument('--event', type=str, choices=['ENTER', 'EXIT', 'FORK'], + parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER', 'EXIT', 'FORK'], help='Filter by event type') parser.add_argument('--fn', type=str, metavar='PATTERN', help='Filter by function name (substring match)') @@ -248,7 +248,7 @@ def main(): filtered_rows = rows if args.event: - event_map = {'ENTER': 0, 
'EXIT': 1, 'FORK': 2} + event_map = {'MARKER': 0, 'ENTER': 1, 'EXIT': 2, 'FORK': 3} event_type = event_map[args.event] filtered_rows = [r for r in filtered_rows if len(r) > 2 and r[2] == event_type] diff --git a/tracegrind/threads.c b/tracegrind/threads.c index 800d1d1f2..960eb864d 100644 --- a/tracegrind/threads.c +++ b/tracegrind/threads.c @@ -181,19 +181,6 @@ void TG_(switch_thread)(ThreadId tid) void TG_(run_thread)(ThreadId tid) { - /* check for dumps needed */ - static ULong bbs_done = 0; - HChar buf[50]; // large enough - - if (TG_(clo).dump_every_bb >0) { - if (TG_(stat).bb_executions - bbs_done > TG_(clo).dump_every_bb) { - VG_(sprintf)(buf, "--dump-every-bb=%llu", TG_(clo).dump_every_bb); - TG_(dump_profile)(buf, False); - bbs_done = TG_(stat).bb_executions; - } - } - - /* now check for thread switch */ TG_(switch_thread)(tid); } diff --git a/tracegrind/tracegrind.h b/tracegrind/tracegrind.h index 20b683fbf..0b37d11c8 100644 --- a/tracegrind/tracegrind.h +++ b/tracegrind/tracegrind.h @@ -75,44 +75,41 @@ typedef enum { - VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'), - VG_USERREQ__ZERO_STATS, + VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'), // ignored + VG_USERREQ__ZERO_STATS, // ignored VG_USERREQ__TOGGLE_COLLECT, - VG_USERREQ__DUMP_STATS_AT, + VG_USERREQ__ADD_MARKER, VG_USERREQ__START_INSTRUMENTATION, VG_USERREQ__STOP_INSTRUMENTATION } Vg_TracegrindClientRequest; -/* Dump current state of cost centers, and zero them afterwards */ -#define TRACEGRIND_DUMP_STATS \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS, \ - 0, 0, 0, 0, 0) - -/* Dump current state of cost centers, and zero them afterwards. - The argument is appended to a string stating the reason which triggered - the dump. This string is written as a description field into the - profile data dump. */ -#define TRACEGRIND_DUMP_STATS_AT(pos_str) \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS_AT, \ - pos_str, 0, 0, 0, 0) - -/* Zero cost centers */ -#define TRACEGRIND_ZERO_STATS \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ZERO_STATS, \ - 0, 0, 0, 0, 0) - /* Toggles collection state. The collection state specifies whether the happening of events should be noted or if they are to be ignored. Events are noted - by increment of counters in a cost center */ + by increment of counters in a cost center + + Same as CALLGRIND_TOGGLE_COLLECT + */ #define TRACEGRIND_TOGGLE_COLLECT \ VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, \ 0, 0, 0, 0, 0) +/* Add a named marker into the trace output. The argument is a string + that will be recorded as a marker label. + + Same as CALLGRIND_DUMP_STATS_AT + */ +#define TRACEGRIND_ADD_MARKER(marker_str) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ADD_MARKER, \ + marker_str, 0, 0, 0, 0) + /* Start full tracegrind instrumentation if not already switched on. When cache simulation is done, it will flush the simulated cache; this will lead to an artificial cache warmup phase afterwards with - cache misses which would not have happened in reality. */ + cache misses which would not have happened in reality. + + Same as CALLGRIND_START_INSTRUMENTATION + */ #define TRACEGRIND_START_INSTRUMENTATION \ VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, \ 0, 0, 0, 0, 0) @@ -123,7 +120,10 @@ typedef speed as the "none" tool (ie. at minimal slowdown). Use this to bypass Tracegrind aggregation for uninteresting code parts. To start Tracegrind in this mode to ignore the setup phase, use - the option "--instr-atstart=no". 
*/ + the option "--instr-atstart=no". + + Same as CALLGRIND_STOP_INSTRUMENTATION + */ #define TRACEGRIND_STOP_INSTRUMENTATION \ VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, \ 0, 0, 0, 0, 0) From cafcab2534901538d0c18e19c287311e011e1cd0 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 23:19:49 +0000 Subject: [PATCH 08/26] refactor: remove dead callgrind remnants from tracegrind Remove ~240 lines of unused code inherited from callgrind: - Dead CLI options (combine-dumps, compress-*, dump-*, collect-alloc, etc.) - Dead struct fields (jCC.creation_seq, BBCC.ret_counter, fn_node.is_malloc/is_realloc/is_free, etc.) - Dead functions (forall_bbccs, zero_bbcc, cachesim_dump_desc, cachesim_add_icost) - Dead types and typedefs (OutputFormat, fCC, SimCost, UserCost, AddrPos, AddrCost, FnPos) - Dead EG_ALLOC event group and its registration Co-Authored-By: Claude Opus 4.6 --- tracegrind/bbcc.c | 57 --------------------------- tracegrind/callstack.c | 4 -- tracegrind/clo.c | 54 +------------------------ tracegrind/fn.c | 9 ----- tracegrind/global.h | 89 ++++-------------------------------------- tracegrind/jumps.c | 4 +- tracegrind/main.c | 7 +--- tracegrind/sim.c | 26 +----------- tracegrind/threads.c | 10 ++--- 9 files changed, 16 insertions(+), 244 deletions(-) diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c index af0210562..e9cb13596 100644 --- a/tracegrind/bbcc.c +++ b/tracegrind/bbcc.c @@ -75,62 +75,6 @@ void TG_(set_current_bbcc_hash)(bbcc_hash* h) current_bbccs.table = h->table; } -/* - * Zero all costs of a BBCC - */ -void TG_(zero_bbcc)(BBCC* bbcc) -{ - Int i; - jCC* jcc; - - TG_ASSERT(bbcc->cxt != 0); - TG_DEBUG(1, " zero_bbcc: BB %#lx, Cxt %u " - "(fn '%s', rec %u)\n", - bb_addr(bbcc->bb), - bbcc->cxt->base_number + bbcc->rec_index, - bbcc->cxt->fn[0]->name, - bbcc->rec_index); - - if ((bbcc->ecounter_sum ==0) && - (bbcc->ret_counter ==0)) return; - - for(i=0;ibb->cost_count;i++) - bbcc->cost[i] = 0; - for(i=0;i <= bbcc->bb->cjmp_count;i++) { - bbcc->jmp[i].ecounter = 0; - for(jcc=bbcc->jmp[i].jcc_list; jcc; jcc=jcc->next_from) { - TG_(init_cost)( TG_(sets).full, jcc->cost ); - jcc->call_counter = 0; - } - } - bbcc->ecounter_sum = 0; - bbcc->ret_counter = 0; -} - - - -void TG_(forall_bbccs)(void (*func)(BBCC*)) -{ - BBCC *bbcc, *bbcc2; - int i, j; - - for (i = 0; i < current_bbccs.size; i++) { - if ((bbcc=current_bbccs.table[i]) == NULL) continue; - while (bbcc) { - /* every bbcc should have a rec_array */ - TG_ASSERT(bbcc->rec_array != 0); - - for(j=0;jcxt->fn[0]->separate_recursions;j++) { - if ((bbcc2 = bbcc->rec_array[j]) == 0) continue; - - (*func)(bbcc2); - } - bbcc = bbcc->next; - } - } -} - - /* All BBCCs for recursion level 0 are inserted into a * thread specific hash table with key * - address of BB structure (unique, as never freed) @@ -277,7 +221,6 @@ BBCC* new_bbcc(BB* bb) bbcc->bb = bb; bbcc->tid = TG_(current_tid); - bbcc->ret_counter = 0; bbcc->skipped = 0; bbcc->cost = TG_(get_costarray)(bb->cost_count); for(i=0;icost_count;i++) diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index 8d262a88a..f81e01e0c 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -338,10 +338,6 @@ void TG_(pop_call_stack)(void) lower_entry->enter_cost, TG_(current_state).cost) ) { - /* only count this call if it attributed some cost. - * the ret_counter is used to check if a BBCC dump is needed. 
- */ - jcc->from->ret_counter++; } TG_(stat).ret_counter++; diff --git a/tracegrind/clo.c b/tracegrind/clo.c index 6c7ac6805..a470300be 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -399,18 +399,12 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) /* compatibility alias, deprecated option */ else if VG_BOOL_CLO(arg, "--trace-jump", TG_(clo).collect_jumps) {} - else if VG_BOOL_CLO(arg, "--combine-dumps", TG_(clo).combine_dumps) {} - else if VG_BOOL_CLO(arg, "--collect-atstart", TG_(clo).collect_atstart) {} else if VG_BOOL_CLO(arg, "--instr-atstart", TG_(clo).instrument_atstart) {} else if VG_BOOL_CLO(arg, "--separate-threads", TG_(clo).separate_threads) {} - else if VG_BOOL_CLO(arg, "--compress-strings", TG_(clo).compress_strings) {} - else if VG_BOOL_CLO(arg, "--compress-mangled", TG_(clo).compress_mangled) {} - else if VG_BOOL_CLO(arg, "--compress-pos", TG_(clo).compress_pos) {} - else if VG_STR_CLO(arg, "--fn-skip", tmp_str) { fn_config* fnc = get_fnc(tmp_str); fnc->skip = CONFIG_TRUE; @@ -488,20 +482,9 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) else if VG_STR_CLO(arg, "--tracegrind-out-file", TG_(clo).out_format) {} - else if VG_XACT_CLO(arg, "--output-format=msgpack", - TG_(clo).output_format, output_format_msgpack) {} - - else if VG_BOOL_CLO(arg, "--mangle-names", TG_(clo).mangle_names) {} - else if VG_BOOL_CLO(arg, "--skip-direct-rec", TG_(clo).skip_direct_recursion) {} - else if VG_BOOL_CLO(arg, "--dump-bbs", TG_(clo).dump_bbs) {} - else if VG_BOOL_CLO(arg, "--dump-line", TG_(clo).dump_line) {} - else if VG_BOOL_CLO(arg, "--dump-instr", TG_(clo).dump_instr) {} - else if VG_BOOL_CLO(arg, "--dump-bb", TG_(clo).dump_bb) {} - - else if VG_BOOL_CLO(arg, "--collect-alloc", TG_(clo).collect_alloc) {} else if VG_XACT_CLO(arg, "--collect-systime=no", TG_(clo).collect_systime, systime_no) {} else if VG_XACT_CLO(arg, "--collect-systime=msec", @@ -542,25 +525,8 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) void TG_(print_usage)(void) { VG_(printf)( -"\n dump creation options:\n" +"\n output options:\n" " --tracegrind-out-file= Output file name [tracegrind.out.%%p]\n" -" --dump-line=no|yes Dump source lines of costs? [yes]\n" -" --dump-instr=no|yes Dump instruction address of costs? [no]\n" -" --compress-strings=no|yes Compress strings in profile dump? [yes]\n" -" --compress-pos=no|yes Compress positions in profile dump? [yes]\n" -" --combine-dumps=no|yes Concat all dumps into same file [no]\n" -#if TG_EXPERIMENTAL -" --compress-events=no|yes Compress events in profile dump? [no]\n" -" --dump-bb=no|yes Dump basic block address of costs? [no]\n" -" --dump-bbs=no|yes Dump basic block info? [no]\n" -" --dump-skipped=no|yes Dump info on skipped functions in calls? [no]\n" -" --mangle-names=no|yes Mangle separation into names? [yes]\n" -#endif - -"\n activity options (for interactivity use tracegrind_control):\n" -#if TG_EXPERIMENTAL -" --dump-objs=no|yes Dump static object information [no]\n" -#endif "\n data collection options:\n" " --instr-atstart=no|yes Do instrumentation at tracegrind start [yes]\n" @@ -568,9 +534,6 @@ void TG_(print_usage)(void) " --toggle-collect= Toggle collection on enter/leave function\n" " --collect-jumps=no|yes Collect jumps? [no]\n" " --collect-bus=no|yes Collect global bus events? [no]\n" -#if TG_EXPERIMENTAL -" --collect-alloc=no|yes Collect memory allocation info? [no]\n" -#endif " --collect-systime=no|yes|msec|usec|nsec Collect system call time info? 
[no]\n" " no Do not collect system call time info.\n" " msec|yes Collect syscount, syscall elapsed time (milli-seconds).\n" @@ -622,24 +585,13 @@ void TG_(set_clo_defaults)(void) { /* Default values for command line arguments */ - /* dump options */ + /* Output */ TG_(clo).out_format = 0; - TG_(clo).combine_dumps = False; - TG_(clo).compress_strings = True; - TG_(clo).compress_mangled = False; - TG_(clo).compress_events = False; - TG_(clo).compress_pos = True; - TG_(clo).mangle_names = True; - TG_(clo).dump_line = True; - TG_(clo).dump_instr = False; - TG_(clo).dump_bb = False; - TG_(clo).dump_bbs = False; /* Collection */ TG_(clo).separate_threads = False; TG_(clo).collect_atstart = True; TG_(clo).collect_jumps = False; - TG_(clo).collect_alloc = False; TG_(clo).collect_systime = systime_no; TG_(clo).collect_bus = False; @@ -662,6 +614,4 @@ void TG_(set_clo_defaults)(void) TG_(clo).verbose = 0; TG_(clo).verbose_start = 0; #endif - - TG_(clo).output_format = output_format_msgpack; } diff --git a/tracegrind/fn.c b/tracegrind/fn.c index f9802c10c..1e866ec57 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -392,7 +392,6 @@ file_node* new_file_node(const HChar *filename, file->fns[i] = NULL; } TG_(stat).distinct_files++; - file->number = TG_(stat).distinct_files; file->obj = obj; file->next = next; return file; @@ -451,10 +450,6 @@ fn_node* new_fn_node(const HChar *fnname, fn->skip = False; fn->obj_skip_checked = False; fn->pop_on_jump = TG_(clo).pop_on_jump; - fn->is_malloc = False; - fn->is_realloc = False; - fn->is_free = False; - fn->group = 0; fn->separate_callers = TG_(clo).separate_callers; fn->separate_recursions = TG_(clo).separate_recursions; @@ -673,10 +668,6 @@ fn_node* TG_(get_fn_node)(BB* bb) (UWord)bb->offset, bb_addr(bb)); } - fn->is_malloc = (VG_(strcmp)(fn->name, "malloc")==0); - fn->is_realloc = (VG_(strcmp)(fn->name, "realloc")==0); - fn->is_free = (VG_(strcmp)(fn->name, "free")==0); - /* apply config options from function name patterns * given on command line */ TG_(update_fn_config)(fn); diff --git a/tracegrind/global.h b/tracegrind/global.h index 8b0101b09..aee88bf7c 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -79,10 +79,6 @@ typedef enum { systime_nsec } Collect_Systime; -typedef enum { - output_format_msgpack = 0 -} OutputFormat; - /* Trace event types */ typedef enum { TG_EV_MARKER = 0, @@ -94,19 +90,9 @@ typedef enum { typedef struct _CommandLineOptions CommandLineOptions; struct _CommandLineOptions { - /* Dump format options */ + /* Output options */ const HChar* out_format; /* Format string for tracegrind output file name */ - Bool combine_dumps; /* Dump trace parts into same file? */ - Bool compress_strings; - Bool compress_events; - Bool compress_pos; - Bool mangle_names; - Bool compress_mangled; - Bool dump_line; - Bool dump_instr; - Bool dump_bb; - Bool dump_bbs; /* Dump basic block information? */ - + /* Collection options */ Bool separate_threads; /* Separate threads in dump? */ Int separate_callers; /* Separate dependent on how many callers? */ @@ -117,7 +103,6 @@ struct _CommandLineOptions { Bool collect_atstart; /* Start in collecting state ? */ Bool collect_jumps; /* Collect (cond.) jumps in functions ? 
*/ - Bool collect_alloc; /* Collect size of allocated memory */ Collect_Systime collect_systime; /* Collect time for system calls */ Bool collect_bus; /* Collect global bus events */ @@ -136,8 +121,6 @@ struct _CommandLineOptions { Int verbose; ULong verbose_start; #endif - - OutputFormat output_format; /* csv or msgpack */ }; /*------------------------------------------------------------*/ @@ -202,7 +185,6 @@ typedef struct _CC CC; typedef struct _BB BB; typedef struct _BBCC BBCC; typedef struct _jCC jCC; -typedef struct _fCC fCC; typedef struct _fn_node fn_node; typedef struct _file_node file_node; typedef struct _obj_node obj_node; @@ -210,10 +192,8 @@ typedef struct _fn_config fn_config; typedef struct _call_entry call_entry; typedef struct _thread_info thread_info; -/* Costs of event sets. Aliases to arrays of 64-bit values */ -typedef ULong* SimCost; /* All events the simulator can produce */ -typedef ULong* UserCost; -typedef ULong* FullCost; /* Simulator + User */ +/* Cost arrays: aliases to arrays of 64-bit event counters */ +typedef ULong* FullCost; /* The types of control flow changes that can happen between @@ -256,7 +236,6 @@ struct _jCC { UInt jmp; /* jump no. in source */ ULong call_counter; /* no wraparound with 64 bit */ - ULong creation_seq; /* creation order sequence number for correct dump order */ FullCost cost; /* simulator + user counters */ }; @@ -385,11 +364,8 @@ struct _BBCC { Context* cxt; /* execution context of this BBCC */ ThreadId tid; /* only for assertion check purpose */ UInt rec_index; /* Recursion index in rec->bbcc for this bbcc */ - BBCC** rec_array; /* Variable sized array of pointers to + BBCC** rec_array; /* Variable sized array of pointers to * recursion BBCCs. Shared. */ - ULong ret_counter; /* how often returned from jccs of this bbcc; - * used to check if a dump for this BBCC is needed */ - BBCC* next_bbcc; /* Chain of BBCCs for same BB */ BBCC* lru_next_bbcc; /* BBCC executed next the last time */ @@ -405,10 +381,6 @@ struct _BBCC { }; -/* the of fn_node, file_node and obj_node are for compressed dumping - * and a index into the dump boolean table and fn_info_table - */ - struct _fn_node { HChar* name; UInt number; @@ -422,10 +394,6 @@ struct _fn_node { Bool obj_skip_checked : 1; Bool pop_on_jump : 1; - Bool is_malloc :1; - Bool is_realloc :1; - Bool is_free :1; - Int group; Int separate_callers; Int separate_recursions; @@ -443,7 +411,6 @@ struct _fn_node { struct _file_node { HChar* name; fn_node* fns[N_FN_ENTRIES]; - UInt number; obj_node* obj; file_node* next; }; @@ -593,9 +560,8 @@ struct _thread_info { call_stack calls; /* context call arc stack */ exec_stack states; /* execution states interrupted by signals */ - /* dump statistics */ - FullCost lastdump_cost; /* Cost at last dump */ - FullCost sighandler_cost; + /* cost tracking */ + FullCost lastdump_cost; /* Cost at last total cost computation */ /* CSV trace: per-thread snapshot of cost at last sample emission */ FullCost last_sample_cost; @@ -606,39 +572,6 @@ struct _thread_info { bbcc_hash bbccs; }; -/* Structs used for dumping */ - -/* Address position inside of a BBCC: - * This includes - * - the address offset from the BB start address - * - file/line from debug info for that address (can change inside a BB) - */ -typedef struct _AddrPos AddrPos; -struct _AddrPos { - Addr addr; - Addr bb_addr; - file_node* file; - UInt line; -}; - -/* a simulator cost entity that can be written out in one line */ -typedef struct _AddrCost AddrCost; -struct _AddrCost { - AddrPos p; - SimCost 
cost; -}; - -/* A function in an execution context */ -typedef struct _FnPos FnPos; -struct _FnPos { - file_node* file; - fn_node* fn; - obj_node* obj; - Context* cxt; - int rec_index; - UInt line; -}; - /*------------------------------------------------------------*/ /*--- Cache simulator interface ---*/ /*------------------------------------------------------------*/ @@ -649,9 +582,7 @@ struct cachesim_if Bool (*parse_opt)(const HChar* arg); void (*post_clo_init)(void); void (*clear)(void); - void (*dump_desc)(VgFile *fp); void (*printstat)(Int,Int,Int); - void (*add_icost)(SimCost, BBCC*, InstrInfo*, ULong); void (*finish)(void); void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); @@ -678,8 +609,7 @@ struct cachesim_if #define EG_BC 4 #define EG_BI 5 #define EG_BUS 6 -#define EG_ALLOC 7 -#define EG_SYS 8 +#define EG_SYS 7 struct event_sets { EventSet *base, *full; @@ -754,10 +684,7 @@ void TG_(init_bbcc_hash)(bbcc_hash* bbccs); void TG_(copy_current_bbcc_hash)(bbcc_hash* dst); bbcc_hash* TG_(get_current_bbcc_hash)(void); void TG_(set_current_bbcc_hash)(bbcc_hash*); -void TG_(forall_bbccs)(void (*func)(BBCC*)); -void TG_(zero_bbcc)(BBCC* bbcc); BBCC* TG_(get_bbcc)(BB* bb); -BBCC* TG_(clone_bbcc)(BBCC* orig, Context* cxt, Int rec_index); void TG_(setup_bbcc)(BB* bb) VG_REGPARM(1); diff --git a/tracegrind/jumps.c b/tracegrind/jumps.c index d8ee30369..d74deba41 100644 --- a/tracegrind/jumps.c +++ b/tracegrind/jumps.c @@ -34,8 +34,7 @@ static jcc_hash current_jccs; -/* Global counter for jCC creation sequence to preserve chronological order */ -static ULong jcc_creation_counter = 0; + void TG_(init_jcc_hash)(jcc_hash* jccs) { @@ -151,7 +150,6 @@ static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) jcc->to = to; jcc->jmpkind = jk_Call; jcc->call_counter = 0; - jcc->creation_seq = jcc_creation_counter++; jcc->cost = 0; /* insert into JCC chain of calling BBCC. diff --git a/tracegrind/main.c b/tracegrind/main.c index 1d9a6fc1f..a691178bb 100644 --- a/tracegrind/main.c +++ b/tracegrind/main.c @@ -2063,11 +2063,6 @@ void TG_(post_clo_init)(void) TG_DEBUG(1, " call sep. : %d\n", TG_(clo).separate_callers); TG_DEBUG(1, " rec. sep. : %d\n", TG_(clo).separate_recursions); - if (!TG_(clo).dump_line && !TG_(clo).dump_instr && !TG_(clo).dump_bb) { - VG_(message)(Vg_UserMsg, "Using source line as position.\n"); - TG_(clo).dump_line = True; - } - (*TG_(cachesim).post_clo_init)(); TG_(init_eventsets)(); @@ -2102,7 +2097,7 @@ void TG_(pre_clo_init)(void) { VG_(details_name) ("Tracegrind"); VG_(details_version) (NULL); - VG_(details_description) ("a streaming CSV trace cache profiler"); + VG_(details_description) ("a streaming trace cache profiler"); VG_(details_copyright_author)("Copyright (C) 2026, and GNU GPL'd, " "by CodSpeed Technology SAS. 
" "Based on Callgrind by Josef Weidendorfer et al."); diff --git a/tracegrind/sim.c b/tracegrind/sim.c index 68e6fa84a..ee7260818 100644 --- a/tracegrind/sim.c +++ b/tracegrind/sim.c @@ -1432,13 +1432,6 @@ void cachesim_clear(void) } -static void cachesim_dump_desc(VgFile *fp) -{ - VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line); - VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line); - VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line); -} - static void cachesim_print_opts(void) { @@ -1622,9 +1615,6 @@ void TG_(init_eventsets)(void) if (TG_(clo).collect_bus) TG_(register_event_group)(EG_BUS, "Ge"); - if (TG_(clo).collect_alloc) - TG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize"); - if (TG_(clo).collect_systime != systime_no) { if (TG_(clo).collect_systime == systime_nsec) TG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime"); @@ -1639,7 +1629,7 @@ void TG_(init_eventsets)(void) TG_(sets).full = TG_(add_event_group2)(TG_(sets).base, EG_DR, EG_DW); TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_BC, EG_BI); TG_(sets).full = TG_(add_event_group) (TG_(sets).full, EG_BUS); - TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_ALLOC, EG_SYS); + TG_(sets).full = TG_(add_event_group) (TG_(sets).full, EG_SYS); TG_DEBUGIF(1) { TG_DEBUG(1, "EventSets:\n"); @@ -1678,18 +1668,6 @@ void TG_(init_eventsets)(void) } -/* this is called at dump time for every instruction executed */ -static void cachesim_add_icost(SimCost cost, BBCC* bbcc, - InstrInfo* ii, ULong exe_count) -{ - if (!TG_(clo).simulate_cache) - cost[ fullOffset(EG_IR) ] += exe_count; - - if (ii->eventset) - TG_(add_and_zero_cost2)( TG_(sets).full, cost, - ii->eventset, bbcc->cost + ii->cost_offset); -} - static void cachesim_finish(void) { @@ -1706,9 +1684,7 @@ struct cachesim_if TG_(cachesim) = { .parse_opt = cachesim_parse_opt, .post_clo_init = cachesim_post_clo_init, .clear = cachesim_clear, - .dump_desc = cachesim_dump_desc, .printstat = cachesim_printstat, - .add_icost = cachesim_add_icost, .finish = cachesim_finish, /* these will be set by cachesim_post_clo_init */ diff --git a/tracegrind/threads.c b/tracegrind/threads.c index 960eb864d..734bf3f48 100644 --- a/tracegrind/threads.c +++ b/tracegrind/threads.c @@ -112,9 +112,7 @@ thread_info* new_thread(void) /* event counters */ t->lastdump_cost = TG_(get_eventset_cost)( TG_(sets).full ); - t->sighandler_cost = TG_(get_eventset_cost)( TG_(sets).full ); TG_(init_cost)( TG_(sets).full, t->lastdump_cost ); - TG_(init_cost)( TG_(sets).full, t->sighandler_cost ); /* CSV trace: per-thread sample snapshot (allocated lazily in trace_emit_sample) */ t->last_sample_cost = 0; @@ -266,12 +264,10 @@ void TG_(post_signal)(ThreadId tid, Int sigNum) TG_(current_fn_stack).top--; } - /* sum up costs */ + /* zero signal handler costs before restoring previous context */ TG_ASSERT(TG_(current_state).cost == es->cost); - TG_(add_and_zero_cost)( TG_(sets).full, - thread[TG_(current_tid)]->sighandler_cost, - TG_(current_state).cost ); - + TG_(zero_cost)( TG_(sets).full, TG_(current_state).cost ); + /* restore previous context */ es->sig = -1; current_states.sp--; From ea18a5be2c7c3b6d262f54be35fe6309d0c7de0d Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 23:39:36 +0000 Subject: [PATCH 09/26] refactor: remove --fn-skip, --obj-skip, and --skip-direct-rec options These callgrind-inherited options are unnecessary for tracegrind's streaming trace model. 
Simplifies recursion depth tracking to always increment/decrement unconditionally. Co-Authored-By: Claude Opus 4.6 --- tracegrind/bbcc.c | 22 +--------------------- tracegrind/callstack.c | 12 ++---------- tracegrind/clo.c | 28 ---------------------------- tracegrind/fn.c | 1 - tracegrind/global.h | 4 ---- 5 files changed, 3 insertions(+), 64 deletions(-) diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c index e9cb13596..b8c57a1ab 100644 --- a/tracegrind/bbcc.c +++ b/tracegrind/bbcc.c @@ -671,19 +671,6 @@ void TG_(setup_bbcc)(BB* bb) if (jmpkind == jk_Call) { fn_node* node = TG_(get_fn_node)(bb); skip = node->skip; - if (!skip && !node->obj_skip_checked){ - HChar* obj_name = node->file->obj->name; - // VG_(printf)(" %s\n", obj_name); - for (int i=0; iskip = True; - skip = True; - break; - } - } - node->obj_skip_checked = True; - } } TG_DEBUGIF(1) { @@ -789,14 +776,7 @@ void TG_(setup_bbcc)(BB* bb) level = *TG_(get_fn_entry)(top->number); if (delayed_push && !skip) { - if (TG_(clo).skip_direct_recursion) { - /* a call was detected, which means that the source BB != 0 */ - TG_ASSERT(TG_(current_state).bbcc != 0); - /* only increment rec. level if called from different function */ - if (TG_(current_state).bbcc->cxt->fn[0] != bbcc->cxt->fn[0]) - level++; - } - else level++; + level++; } if (level> top->separate_recursions) level = top->separate_recursions; diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index f81e01e0c..f8fecf1ec 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -202,11 +202,7 @@ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) TG_ASSERT(jcc != 0); pdepth = TG_(get_fn_entry)(to_fn->number); - if (TG_(clo).skip_direct_recursion) { - /* only increment depth if another function is called */ - if (jcc->from->cxt->fn[0] != to_fn) (*pdepth)++; - } - else (*pdepth)++; + (*pdepth)++; if (*pdepth>1) TG_(stat).rec_call_counter++; @@ -326,11 +322,7 @@ void TG_(pop_call_stack)(void) if (jcc) { fn_node* to_fn = jcc->to->cxt->fn[0]; UInt* pdepth = TG_(get_fn_entry)(to_fn->number); - if (TG_(clo).skip_direct_recursion) { - /* only decrement depth if another function was called */ - if (jcc->from->cxt->fn[0] != to_fn) (*pdepth)--; - } - else (*pdepth)--; + (*pdepth)--; depth = *pdepth; /* add cost difference to sum */ diff --git a/tracegrind/clo.c b/tracegrind/clo.c index a470300be..3bd96697e 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -44,7 +44,6 @@ struct _fn_config { Int toggle_collect; - Int skip; /* Handle CALL to this function as JMP (= Skip)? 
*/ Int group; /* don't change caller dependency inside group !=0 */ Int pop_on_jump; @@ -97,7 +96,6 @@ fn_config* new_fnc(void) sizeof(fn_config)); fnc->toggle_collect = CONFIG_DEFAULT; - fnc->skip = CONFIG_DEFAULT; fnc->pop_on_jump = CONFIG_DEFAULT; fnc->group = CONFIG_DEFAULT; fnc->separate_callers = CONFIG_DEFAULT; @@ -312,9 +310,6 @@ static void update_fn_config1(fn_node* fn, fn_config* fnc) if (fnc->toggle_collect != CONFIG_DEFAULT) fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE); - if (fnc->skip != CONFIG_DEFAULT) - fn->skip = (fnc->skip == CONFIG_TRUE); - if (fnc->pop_on_jump != CONFIG_DEFAULT) fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE); @@ -405,19 +400,6 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) else if VG_BOOL_CLO(arg, "--separate-threads", TG_(clo).separate_threads) {} - else if VG_STR_CLO(arg, "--fn-skip", tmp_str) { - fn_config* fnc = get_fnc(tmp_str); - fnc->skip = CONFIG_TRUE; - } - else if VG_STR_CLO(arg, "--obj-skip", tmp_str) { - HChar *obj_name = VG_(strdup)("cl.clo.pclo.1", tmp_str); - TG_(clo).objs_to_skip_count++; - TG_(clo).objs_to_skip = VG_(realloc)("cl.clo.pclo.2", - TG_(clo).objs_to_skip, - TG_(clo).objs_to_skip_count*sizeof(HChar*)); - TG_(clo).objs_to_skip[TG_(clo).objs_to_skip_count-1] = obj_name; - } - else if VG_STR_CLO(arg, "--toggle-collect", tmp_str) { fn_config* fnc = get_fnc(tmp_str); fnc->toggle_collect = CONFIG_TRUE; @@ -482,9 +464,6 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) else if VG_STR_CLO(arg, "--tracegrind-out-file", TG_(clo).out_format) {} - else if VG_BOOL_CLO(arg, "--skip-direct-rec", - TG_(clo).skip_direct_recursion) {} - else if VG_XACT_CLO(arg, "--collect-systime=no", TG_(clo).collect_systime, systime_no) {} else if VG_XACT_CLO(arg, "--collect-systime=msec", @@ -547,9 +526,6 @@ void TG_(print_usage)(void) " --separate-recs= Separate function recursions up to level [2]\n" " --separate-recs= Separate recursions for function \n" " --skip-plt=no|yes Ignore calls to/from PLT sections? [yes]\n" -" --skip-direct-rec=no|yes Ignore direct recursions? [yes]\n" -" --fn-skip= Ignore calls to/from function?\n" -" --obj-skip= Ignore calls to/from object?\n" #if TG_EXPERIMENTAL " --fn-group= Put function into separation group \n" #endif @@ -598,8 +574,6 @@ void TG_(set_clo_defaults)(void) TG_(clo).skip_plt = True; TG_(clo).separate_callers = 0; TG_(clo).separate_recursions = 2; - TG_(clo).skip_direct_recursion = False; - /* Instrumentation */ TG_(clo).instrument_atstart = True; TG_(clo).simulate_cache = False; @@ -607,8 +581,6 @@ void TG_(set_clo_defaults)(void) /* Call graph */ TG_(clo).pop_on_jump = False; - TG_(clo).objs_to_skip_count = 0; - TG_(clo).objs_to_skip = 0; #if TG_ENABLE_DEBUG TG_(clo).verbose = 0; diff --git a/tracegrind/fn.c b/tracegrind/fn.c index 1e866ec57..e84e73cd9 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -448,7 +448,6 @@ fn_node* new_fn_node(const HChar *fnname, fn->toggle_collect = False; fn->skip = False; - fn->obj_skip_checked = False; fn->pop_on_jump = TG_(clo).pop_on_jump; fn->group = 0; fn->separate_callers = TG_(clo).separate_callers; diff --git a/tracegrind/global.h b/tracegrind/global.h index aee88bf7c..d6a3d2b01 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -98,7 +98,6 @@ struct _CommandLineOptions { Int separate_callers; /* Separate dependent on how many callers? */ Int separate_recursions; /* Max level of recursions to separate */ Bool skip_plt; /* Skip functions in PLT section? 
*/ - Bool skip_direct_recursion; /* Increment direct recursions the level? */ Bool collect_atstart; /* Start in collecting state ? */ Bool collect_jumps; /* Collect (cond.) jumps in functions ? */ @@ -114,8 +113,6 @@ struct _CommandLineOptions { /* Call graph generation */ Bool pop_on_jump; /* Handle a jump between functions as ret+call */ - Int objs_to_skip_count; /* Number of objects to skip */ - HChar** objs_to_skip; /* List of objects to skip */ #if TG_ENABLE_DEBUG Int verbose; @@ -391,7 +388,6 @@ struct _fn_node { Bool toggle_collect :1; Bool skip :1; - Bool obj_skip_checked : 1; Bool pop_on_jump : 1; Int group; From c699fd188ebf2ea9430f1ef01092169457cf0b5b Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Fri, 6 Feb 2026 23:52:31 +0000 Subject: [PATCH 10/26] feat(tracegrind): add regression tests and CI integration Add vg_regtest-based regression tests covering basic tracing, markers, instrumentation toggle, toggle collect, call chains, inlining behavior, and schema validation. Extend CI matrix to run tracegrind tests alongside callgrind on both Ubuntu 22.04 and 24.04. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 21 +++++--- .gitignore | 1 + ...egrind-analyzer.py => tracegrind-analyzer} | 0 tracegrind/tests/Makefile.am | 36 ++++++++++++- tracegrind/tests/filter_stderr | 36 +++++++++++++ tracegrind/tests/filter_trace | 54 +++++++++++++++++++ tracegrind/tests/test_basic.c | 11 ++++ tracegrind/tests/test_basic.post.exp | 18 +++++++ tracegrind/tests/test_basic.stderr.exp | 6 +++ tracegrind/tests/test_basic.vgtest | 5 ++ tracegrind/tests/test_foo_bar_baz.c | 23 ++++++++ tracegrind/tests/test_foo_bar_baz.post.exp | 23 ++++++++ tracegrind/tests/test_foo_bar_baz.stderr.exp | 6 +++ tracegrind/tests/test_foo_bar_baz.vgtest | 5 ++ tracegrind/tests/test_inline.c | 25 +++++++++ tracegrind/tests/test_inline.post.exp | 15 ++++++ tracegrind/tests/test_inline.stderr.exp | 6 +++ tracegrind/tests/test_inline.vgtest | 5 ++ tracegrind/tests/test_instr_toggle.c | 18 +++++++ tracegrind/tests/test_instr_toggle.post.exp | 19 +++++++ tracegrind/tests/test_instr_toggle.stderr.exp | 6 +++ tracegrind/tests/test_instr_toggle.vgtest | 5 ++ tracegrind/tests/test_marker.c | 15 ++++++ tracegrind/tests/test_marker.post.exp | 13 +++++ tracegrind/tests/test_marker.stderr.exp | 6 +++ tracegrind/tests/test_marker.vgtest | 5 ++ tracegrind/tests/test_schema.post.exp | 10 ++++ tracegrind/tests/test_schema.stderr.exp | 6 +++ tracegrind/tests/test_schema.vgtest | 5 ++ tracegrind/tests/test_toggle_collect.c | 23 ++++++++ tracegrind/tests/test_toggle_collect.post.exp | 18 +++++++ .../tests/test_toggle_collect.stderr.exp | 6 +++ tracegrind/tests/test_toggle_collect.vgtest | 5 ++ 33 files changed, 447 insertions(+), 9 deletions(-) rename tracegrind/scripts/{tracegrind-analyzer.py => tracegrind-analyzer} (100%) create mode 100755 tracegrind/tests/filter_stderr create mode 100755 tracegrind/tests/filter_trace create mode 100644 tracegrind/tests/test_basic.c create mode 100644 tracegrind/tests/test_basic.post.exp create mode 100644 tracegrind/tests/test_basic.stderr.exp create mode 100644 tracegrind/tests/test_basic.vgtest create mode 100644 tracegrind/tests/test_foo_bar_baz.c create mode 100644 tracegrind/tests/test_foo_bar_baz.post.exp create mode 100644 tracegrind/tests/test_foo_bar_baz.stderr.exp create mode 100644 tracegrind/tests/test_foo_bar_baz.vgtest create mode 100644 tracegrind/tests/test_inline.c create mode 100644 tracegrind/tests/test_inline.post.exp create mode 100644 
tracegrind/tests/test_inline.stderr.exp create mode 100644 tracegrind/tests/test_inline.vgtest create mode 100644 tracegrind/tests/test_instr_toggle.c create mode 100644 tracegrind/tests/test_instr_toggle.post.exp create mode 100644 tracegrind/tests/test_instr_toggle.stderr.exp create mode 100644 tracegrind/tests/test_instr_toggle.vgtest create mode 100644 tracegrind/tests/test_marker.c create mode 100644 tracegrind/tests/test_marker.post.exp create mode 100644 tracegrind/tests/test_marker.stderr.exp create mode 100644 tracegrind/tests/test_marker.vgtest create mode 100644 tracegrind/tests/test_schema.post.exp create mode 100644 tracegrind/tests/test_schema.stderr.exp create mode 100644 tracegrind/tests/test_schema.vgtest create mode 100644 tracegrind/tests/test_toggle_collect.c create mode 100644 tracegrind/tests/test_toggle_collect.post.exp create mode 100644 tracegrind/tests/test_toggle_collect.stderr.exp create mode 100644 tracegrind/tests/test_toggle_collect.vgtest diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ae16aa5d..8a4ae6db7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ on: workflow_dispatch: jobs: - test-callgrind: + test: strategy: matrix: runner: @@ -16,6 +16,9 @@ jobs: ubuntu-version: 22.04 - platform: ubuntu-24.04 ubuntu-version: 24.04 + tool: + - callgrind + - tracegrind runs-on: ${{ matrix.runner.platform }} @@ -32,7 +35,7 @@ jobs: path-exclude /usr/share/man/* path-exclude /usr/share/info/* EOF - + - name: Update apt-get cache run: sudo apt-get update @@ -51,6 +54,10 @@ jobs: docbook-xml \ xsltproc + - name: Install uv + if: matrix.tool == 'tracegrind' + uses: astral-sh/setup-uv@v7 + - name: Run autogen run: ./autogen.sh @@ -63,11 +70,11 @@ jobs: - name: Build test dependencies run: | make -C tests arch_test os_test true - make -C callgrind/tests check + make -C ${{ matrix.tool }}/tests check - - name: Run Callgrind tests + - name: Run tests run: | - cd callgrind/tests + cd ${{ matrix.tool }}/tests TESTS=$(ls *.vgtest | grep -v bug497723.vgtest) perl ../../tests/vg_regtest --valgrind=../../vg-in-place $TESTS @@ -75,5 +82,5 @@ jobs: if: failure() uses: actions/upload-artifact@v4 with: - name: callgrind-test-logs-${{ matrix.runner.ubuntu-version }} - path: callgrind/tests/*.log + name: ${{ matrix.tool }}-test-logs-${{ matrix.runner.ubuntu-version }} + path: ${{ matrix.tool }}/tests/*.log diff --git a/.gitignore b/.gitignore index 6a2f18e30..132e768e7 100644 --- a/.gitignore +++ b/.gitignore @@ -187,6 +187,7 @@ /tracegrind/tests/Makefile.in /tracegrind/tests/tracegrind.out.* /tracegrind/tests/fibo +/tracegrind/tests/*.bin # /coregrind/ /coregrind/*.a diff --git a/tracegrind/scripts/tracegrind-analyzer.py b/tracegrind/scripts/tracegrind-analyzer similarity index 100% rename from tracegrind/scripts/tracegrind-analyzer.py rename to tracegrind/scripts/tracegrind-analyzer diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am index 5351eb577..ec5fcd85b 100644 --- a/tracegrind/tests/Makefile.am +++ b/tracegrind/tests/Makefile.am @@ -1,3 +1,35 @@ -dist_noinst_SCRIPTS = -EXTRA_DIST = +include $(top_srcdir)/Makefile.tool-tests.am + +SUBDIRS = . +DIST_SUBDIRS = . 
+ +dist_noinst_SCRIPTS = filter_stderr filter_trace + +check_PROGRAMS = \ + test_basic.bin \ + test_marker.bin \ + test_instr_toggle.bin \ + test_toggle_collect.bin \ + test_foo_bar_baz.bin \ + test_inline.bin + +AM_CFLAGS += $(AM_FLAG_M3264_PRI) +AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) + +test_basic_bin_SOURCES = test_basic.c +test_marker_bin_SOURCES = test_marker.c +test_instr_toggle_bin_SOURCES = test_instr_toggle.c +test_toggle_collect_bin_SOURCES = test_toggle_collect.c +test_foo_bar_baz_bin_SOURCES = test_foo_bar_baz.c +test_inline_bin_SOURCES = test_inline.c +test_inline_bin_CFLAGS = $(AM_CFLAGS) -O2 -g + +EXTRA_DIST = \ + test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ + test_marker.vgtest test_marker.stderr.exp test_marker.post.exp \ + test_instr_toggle.vgtest test_instr_toggle.stderr.exp test_instr_toggle.post.exp \ + test_toggle_collect.vgtest test_toggle_collect.stderr.exp test_toggle_collect.post.exp \ + test_foo_bar_baz.vgtest test_foo_bar_baz.stderr.exp test_foo_bar_baz.post.exp \ + test_inline.vgtest test_inline.stderr.exp test_inline.post.exp \ + test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/filter_stderr b/tracegrind/tests/filter_stderr new file mode 100755 index 000000000..67355a7b2 --- /dev/null +++ b/tracegrind/tests/filter_stderr @@ -0,0 +1,36 @@ +#! /bin/sh + +dir=`dirname $0` + +$dir/../../tests/filter_stderr_basic | + +# Remove "Tracegrind, ..." line and the following copyright line. +sed "/^Tracegrind, a call-graph generating cache profiler/ , /./ d" | + +# Remove pointer to tracegrind_control +sed "/^For interactive control,.*$/d" | + +# Remove numbers from "Collected" line +sed "s/^\(Collected *:\)[ 0-9]*$/\1/" | + +# Remove numbers from I/D/LL "refs:" lines +perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' | + +# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines +perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' | + +# Remove numbers from "Branches:", "Mispredicts:, and "Mispred rate:" lines +perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' | + +# Remove CPUID warnings lines for P4s and other machines +sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" | +sed "/Simulating a 16 KB I-cache with 32 B lines/d" | +sed "/warning: L3 cache found, using its data for the LL simulation./d" | +sed "/warning: L4 cache found, using its data for the LL simulation./d" | +sed "/Warning: Cannot auto-detect cache config, using defaults./d" | +sed "/Run with -v to see./d" | +sed "/warning: specified LL cache: line_size .*$/d" | +sed "/warning: simulated LL cache: line_size .*$/d" | + +# Remove trace output file path messages +sed "/^Trace output to /d" diff --git a/tracegrind/tests/filter_trace b/tracegrind/tests/filter_trace new file mode 100755 index 000000000..6e2065baf --- /dev/null +++ b/tracegrind/tests/filter_trace @@ -0,0 +1,54 @@ +#!/bin/sh +# +# Filter tracegrind trace output (from tracegrind-analyzer) +# to normalize machine-dependent values for regression testing. +# + +# Normalize format/schema version numbers +sed 's/^Format Version: [0-9]\+$/Format Version: N/' | +sed 's/^Schema Version: [0-9]\+$/Schema Version: N/' | + +# Normalize object paths: replace full path to test binary with just the basename +# e.g. obj=/home/user/valgrind/tracegrind/tests/test_marker -> obj=test_marker +sed 's|obj=[^ |]*[/]||g' | + +# Normalize file paths: replace full source paths with just the basename +# e.g. 
file=/home/user/.../test_marker.c -> file=test_marker.c +sed 's|file=[^ |]*[/]||g' | + +# Normalize function address/stats that vary: Ir counts +# Replace Ir= with Ir=N +sed 's|Ir=[0-9]\+|Ir=N|g' | + +# Remove the separator line +sed '/^-\{10,\}$/d' | + +# Normalize "Total rows:" count +sed 's/^Total rows: [0-9,]\+$/Total rows: N/' | + +# Normalize "Showing X of Y rows" +sed 's/^Showing [0-9,]\+ of [0-9,]\+ rows$/Showing N of N rows/' | + +# Normalize "Sequence range:" numbers +sed 's/^Sequence range: [0-9,]\+ - [0-9,]\+$/Sequence range: N - N/' | + +# Normalize event count percentages in stats +sed 's/\([0-9,]\+\) ([0-9.]\+%)/N (P%)/g' | + +# Normalize "Threads: N ([...])" +sed 's/^Threads: \([0-9]\+\) (\[.*\])/Threads: \1/' | + +# Remove "Top 10 functions" section (platform-dependent) +sed '/^Top 10 functions/,/^$/d' | + +# Remove "Fork events" section (platform-dependent) +sed '/^Fork events/,/^$/d' | + +# Normalize seq numbers in raw arrays: [1234, ...] -> [N, ...] +sed 's/^\[\([0-9]\+\),/[N,/g' | + +# Normalize seq= in formatted output +sed 's/seq=[0-9]\+/seq=N/g' | + +# Strip GCC optimization suffixes from function names (e.g. .constprop.0, .isra.0, .part.0) +sed 's/\.\(constprop\|isra\|part\|cold\|lto_priv\)\.[0-9]*//g' diff --git a/tracegrind/tests/test_basic.c b/tracegrind/tests/test_basic.c new file mode 100644 index 000000000..971ad25c7 --- /dev/null +++ b/tracegrind/tests/test_basic.c @@ -0,0 +1,11 @@ +#include "tracegrind.h" + +static int factorial(int n) { + if (n <= 1) return 1; + return n * factorial(n - 1); +} + +int main(void) { + int result = factorial(5); + return result != 120; +} diff --git a/tracegrind/tests/test_basic.post.exp b/tracegrind/tests/test_basic.post.exp new file mode 100644 index 000000000..173b52f57 --- /dev/null +++ b/tracegrind/tests/test_basic.post.exp @@ -0,0 +1,18 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + +Total rows: N + +Events by type: + ENTER: N (P%) + EXIT: N (P%) + +Threads: 1 +Sequence range: N - N diff --git a/tracegrind/tests/test_basic.stderr.exp b/tracegrind/tests/test_basic.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_basic.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_basic.vgtest b/tracegrind/tests/test_basic.vgtest new file mode 100644 index 000000000..5c95483ea --- /dev/null +++ b/tracegrind/tests/test_basic.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_basic.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_basic +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_basic.msgpack.lz4 --stats | ./filter_trace +cleanup: rm -f tracegrind.out.test_basic.msgpack.lz4 diff --git a/tracegrind/tests/test_foo_bar_baz.c b/tracegrind/tests/test_foo_bar_baz.c new file mode 100644 index 000000000..e9f7a6783 --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.c @@ -0,0 +1,23 @@ +#include "tracegrind.h" + +static int __attribute__((noinline)) baz(int n) { + return n * 2; +} + +static int __attribute__((noinline)) bar(int n) { + return baz(n) + 1; +} + +static int __attribute__((noinline)) foo(int n) { + return bar(n) + bar(n + 1); +} + +int 
main(void) { + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = foo(3); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return result != (baz(3) + 1 + baz(4) + 1); +} diff --git a/tracegrind/tests/test_foo_bar_baz.post.exp b/tracegrind/tests/test_foo_bar_baz.post.exp new file mode 100644 index 000000000..3b6b2d2d3 --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=foo | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=foo | test_foo_bar_baz | ... | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_foo_bar_baz.stderr.exp b/tracegrind/tests/test_foo_bar_baz.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_foo_bar_baz.vgtest b/tracegrind/tests/test_foo_bar_baz.vgtest new file mode 100644 index 000000000..c1cfaeefe --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_foo_bar_baz.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_foo_bar_baz --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_foo_bar_baz.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_foo_bar_baz.msgpack.lz4 diff --git a/tracegrind/tests/test_inline.c b/tracegrind/tests/test_inline.c new file mode 100644 index 000000000..fb295f73f --- /dev/null +++ b/tracegrind/tests/test_inline.c @@ -0,0 +1,25 @@ +#include "tracegrind.h" + +/* Force inlining - these should NOT appear as ENTER/EXIT in the trace */ +static inline __attribute__((always_inline)) int inlined_add(int a, int b) { + return a + b; +} + +static inline __attribute__((always_inline)) int inlined_mul(int a, int b) { + return a * b; +} + +/* Prevent inlining - these SHOULD appear as ENTER/EXIT in the trace */ +static int __attribute__((noinline)) not_inlined_work(int n) { + return inlined_add(n, inlined_mul(n, 2)); +} + +int main(void) { + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_work(5); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return result != 15; +} diff --git a/tracegrind/tests/test_inline.post.exp b/tracegrind/tests/test_inline.post.exp new file 
mode 100644 index 000000000..1aa9c05b2 --- /dev/null +++ b/tracegrind/tests/test_inline.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=not_inlined_work | test_inline | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=not_inlined_work | test_inline | ... | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_inline.stderr.exp b/tracegrind/tests/test_inline.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_inline.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_inline.vgtest b/tracegrind/tests/test_inline.vgtest new file mode 100644 index 000000000..f6ab09838 --- /dev/null +++ b/tracegrind/tests/test_inline.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_inline.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_inline --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_inline.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_inline.msgpack.lz4 diff --git a/tracegrind/tests/test_instr_toggle.c b/tracegrind/tests/test_instr_toggle.c new file mode 100644 index 000000000..e47767d2c --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.c @@ -0,0 +1,18 @@ +#include "tracegrind.h" + +static int __attribute__((noinline)) fibo(int n) { + if (n <= 1) return n; + return fibo(n - 1) + fibo(n - 2); +} + +int main(void) { + /* Instrumentation is off (--instr-atstart=no). + Only the fibo(2) call will be traced. */ + TRACEGRIND_ADD_MARKER("before-fibo"); + TRACEGRIND_START_INSTRUMENTATION; + int result = fibo(2); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("after-fibo"); + + return result != 1; +} diff --git a/tracegrind/tests/test_instr_toggle.post.exp b/tracegrind/tests/test_instr_toggle.post.exp new file mode 100644 index 000000000..dba045bc2 --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.post.exp @@ -0,0 +1,19 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=before-fibo +seq=N | tid=1 | event=ENTER | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=fibo | test_instr_toggle | ... 
| line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=after-fibo diff --git a/tracegrind/tests/test_instr_toggle.stderr.exp b/tracegrind/tests/test_instr_toggle.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_instr_toggle.vgtest b/tracegrind/tests/test_instr_toggle.vgtest new file mode 100644 index 000000000..75adf7aba --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_instr_toggle.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_instr_toggle --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_instr_toggle.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_instr_toggle.msgpack.lz4 diff --git a/tracegrind/tests/test_marker.c b/tracegrind/tests/test_marker.c new file mode 100644 index 000000000..76a5d72cc --- /dev/null +++ b/tracegrind/tests/test_marker.c @@ -0,0 +1,15 @@ +#include "tracegrind.h" + +static int compute(int n) { + int sum = 0; + for (int i = 0; i < n; i++) + sum += i * i; + return sum; +} + +int main(void) { + TRACEGRIND_ADD_MARKER("start-work"); + int result = compute(1000); + TRACEGRIND_ADD_MARKER("end-work"); + return result == 0; +} diff --git a/tracegrind/tests/test_marker.post.exp b/tracegrind/tests/test_marker.post.exp new file mode 100644 index 000000000..b3b813b1a --- /dev/null +++ b/tracegrind/tests/test_marker.post.exp @@ -0,0 +1,13 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + +Showing N of N rows +[N, 1, 0, 'start-work'] +[N, 1, 0, 'end-work'] diff --git a/tracegrind/tests/test_marker.stderr.exp b/tracegrind/tests/test_marker.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_marker.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_marker.vgtest b/tracegrind/tests/test_marker.vgtest new file mode 100644 index 000000000..fe3b45a0a --- /dev/null +++ b/tracegrind/tests/test_marker.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_marker.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_marker +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_marker.msgpack.lz4 --event MARKER --raw | ./filter_trace +cleanup: rm -f tracegrind.out.test_marker.msgpack.lz4 diff --git a/tracegrind/tests/test_schema.post.exp b/tracegrind/tests/test_schema.post.exp new file mode 100644 index 000000000..86f5f9a21 --- /dev/null +++ b/tracegrind/tests/test_schema.post.exp @@ -0,0 +1,10 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + diff --git a/tracegrind/tests/test_schema.stderr.exp b/tracegrind/tests/test_schema.stderr.exp new file mode 100644 index 
000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_schema.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_schema.vgtest b/tracegrind/tests/test_schema.vgtest new file mode 100644 index 000000000..4f96bd7df --- /dev/null +++ b/tracegrind/tests/test_schema.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_basic.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_schema +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_schema.msgpack.lz4 --schema | ./filter_trace +cleanup: rm -f tracegrind.out.test_schema.msgpack.lz4 diff --git a/tracegrind/tests/test_toggle_collect.c b/tracegrind/tests/test_toggle_collect.c new file mode 100644 index 000000000..4d7de4ceb --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.c @@ -0,0 +1,23 @@ +#include "tracegrind.h" + +static int work(int n) { + int sum = 0; + for (int i = 0; i < n; i++) + sum += i; + return sum; +} + +int main(void) { + /* Collection on by default, do some traced work */ + int result = work(10); + + /* Toggle collection off */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(20); /* not collected */ + + /* Toggle collection back on */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(30); /* collected again */ + + return result == 0; +} diff --git a/tracegrind/tests/test_toggle_collect.post.exp b/tracegrind/tests/test_toggle_collect.post.exp new file mode 100644 index 000000000..173b52f57 --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.post.exp @@ -0,0 +1,18 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + +Total rows: N + +Events by type: + ENTER: N (P%) + EXIT: N (P%) + +Threads: 1 +Sequence range: N - N diff --git a/tracegrind/tests/test_toggle_collect.stderr.exp b/tracegrind/tests/test_toggle_collect.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_toggle_collect.vgtest b/tracegrind/tests/test_toggle_collect.vgtest new file mode 100644 index 000000000..a0178f7eb --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_toggle_collect.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_toggle_collect +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_toggle_collect.msgpack.lz4 --stats | ./filter_trace +cleanup: rm -f tracegrind.out.test_toggle_collect.msgpack.lz4 From 39d13466c5be8a33876d15f61a7fc13f57a7c5fe Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 01:54:24 +0000 Subject: [PATCH 11/26] feat(tracegrind): add ENTER_INLINED/EXIT_INLINED events for inline function tracking Track inlined function transitions at the BB level using Valgrind's debug info API. This bumps the trace format to v3 with two new event types (4=ENTER_INLINED, 5=EXIT_INLINED), updates the analyzer script to handle them, and adds regression tests for enter and nested inlined scenarios. 
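In short, each BB records the inline function name that VG_(get_inline_fnname) reports at its start address, and a transition event is emitted whenever that name differs from the one seen on the previously executed BB; a real call or return while inside inlined code also closes the currently open inlined frame. As a rough illustration, the abridged analyzer output expected by the new test_enter_inlined regression test looks like this (obj/file/line/Ir columns elided here; seq and Ir values are normalized by the test filter):

    seq=N | tid=1 | event=MARKER | marker=start
    seq=N | tid=1 | event=ENTER | fn=not_inlined_caller | ...
    seq=N | tid=1 | event=ENTER_INLINED | fn=inlined_work | ...
    seq=N | tid=1 | event=EXIT_INLINED | fn=inlined_work | ...
    seq=N | tid=1 | event=EXIT | fn=not_inlined_caller | ...
    seq=N | tid=1 | event=MARKER | marker=end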
Co-Authored-By: Claude Opus 4.6 --- tracegrind/bb.c | 1 + tracegrind/bbcc.c | 15 +++ tracegrind/callstack.c | 16 +++ tracegrind/dump.c | 108 +++++++++++++++++- tracegrind/fn.c | 8 ++ tracegrind/global.h | 45 +++++--- tracegrind/scripts/tracegrind-analyzer | 13 ++- tracegrind/sim.c | 4 - tracegrind/tests/Makefile.am | 10 +- tracegrind/tests/filter_stderr | 4 +- tracegrind/tests/test_basic.post.exp | 3 + tracegrind/tests/test_enter_inlined.c | 32 ++++++ tracegrind/tests/test_enter_inlined.post.exp | 19 +++ .../tests/test_enter_inlined.stderr.exp | 6 + tracegrind/tests/test_enter_inlined.vgtest | 5 + tracegrind/tests/test_foo_bar_baz.post.exp | 22 ++-- tracegrind/tests/test_inline.post.exp | 6 +- tracegrind/tests/test_instr_toggle.post.exp | 14 ++- tracegrind/tests/test_marker.post.exp | 2 + tracegrind/tests/test_nested_inlined.c | 47 ++++++++ tracegrind/tests/test_nested_inlined.post.exp | 21 ++++ .../tests/test_nested_inlined.stderr.exp | 6 + tracegrind/tests/test_nested_inlined.vgtest | 5 + tracegrind/tests/test_schema.post.exp | 2 + tracegrind/tests/test_toggle_collect.post.exp | 3 + 25 files changed, 364 insertions(+), 53 deletions(-) create mode 100644 tracegrind/tests/test_enter_inlined.c create mode 100644 tracegrind/tests/test_enter_inlined.post.exp create mode 100644 tracegrind/tests/test_enter_inlined.stderr.exp create mode 100644 tracegrind/tests/test_enter_inlined.vgtest create mode 100644 tracegrind/tests/test_nested_inlined.c create mode 100644 tracegrind/tests/test_nested_inlined.post.exp create mode 100644 tracegrind/tests/test_nested_inlined.stderr.exp create mode 100644 tracegrind/tests/test_nested_inlined.vgtest diff --git a/tracegrind/bb.c b/tracegrind/bb.c index 32f5a6c7c..e34021004 100644 --- a/tracegrind/bb.c +++ b/tracegrind/bb.c @@ -143,6 +143,7 @@ static BB* new_bb(obj_node* obj, PtrdiffT offset, bb->fn = 0; bb->line = 0; bb->is_entry = 0; + bb->inl_fn = NULL; bb->bbcc_list = 0; bb->last_bbcc = 0; diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c index b8c57a1ab..a511f6bc5 100644 --- a/tracegrind/bbcc.c +++ b/tracegrind/bbcc.c @@ -830,6 +830,21 @@ void TG_(setup_bbcc)(BB* bb) } TG_(current_state).bbcc = bbcc; + + /* Check for inline function transition */ + if (TG_(current_state).collect) { + thread_info* ti = TG_(get_current_thread)(); + if (ti && bb->inl_fn != ti->cur_inl_fn) { + if (ti->cur_inl_fn != NULL) { + TG_(trace_emit_exit_inlined)(TG_(current_tid), bb, ti->cur_inl_fn); + } + if (bb->inl_fn != NULL) { + TG_(trace_emit_enter_inlined)(TG_(current_tid), bb); + } + ti->cur_inl_fn = bb->inl_fn; + } + } + /* Even though this will be set in instrumented code directly before * side exits, it needs to be set to 0 here in case an exception * happens in first instructions of the BB */ diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index f8fecf1ec..dd694b493 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -234,6 +234,14 @@ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) /* Emit trace sample on function entry */ if (!skip && TG_(current_state).collect) { + /* Emit EXIT_INLINED if we're entering a new function while inside inlined code */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_fn != NULL && TG_(current_state).bbcc) { + TG_(trace_emit_exit_inlined)(TG_(current_tid), + TG_(current_state).bbcc->bb, + ti->cur_inl_fn); + ti->cur_inl_fn = NULL; + } fn_node* to_fn = to->cxt->fn[0]; TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); } @@ -335,6 +343,14 @@ void 
TG_(pop_call_stack)(void) /* Emit trace sample on function exit */ if (TG_(current_state).collect) { + /* Emit EXIT_INLINED if we're leaving while inside inlined code */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_fn != NULL && TG_(current_state).bbcc) { + TG_(trace_emit_exit_inlined)(TG_(current_tid), + TG_(current_state).bbcc->bb, + ti->cur_inl_fn); + ti->cur_inl_fn = NULL; + } TG_(trace_emit_sample)(TG_(current_tid), False, to_fn); } diff --git a/tracegrind/dump.c b/tracegrind/dump.c index a89b54a87..811b5c5ae 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -116,7 +116,7 @@ static void msgpack_write_header(void) /* version */ msgpack_write_key(&hdr, "version"); - msgpack_write_uint(&hdr, 2); + msgpack_write_uint(&hdr, 3); /* format */ msgpack_write_key(&hdr, "format"); @@ -124,7 +124,7 @@ static void msgpack_write_header(void) /* event_schemas - discriminated union: each event type has its own schema */ msgpack_write_key(&hdr, "event_schemas"); - msgpack_write_map_header(&hdr, 4); /* 4 event types: MARKER, ENTER, EXIT, FORK */ + msgpack_write_map_header(&hdr, 6); /* 6 event types: MARKER, ENTER, EXIT, FORK, ENTER_INLINED, EXIT_INLINED */ /* Event type 0 (MARKER) schema */ msgpack_write_key(&hdr, "0"); @@ -156,6 +156,20 @@ static void msgpack_write_header(void) msgpack_write_str(&hdr, "event", -1); msgpack_write_str(&hdr, "child_pid", -1); + /* Event type 4 (ENTER_INLINED) schema - same columns as ENTER/EXIT */ + msgpack_write_key(&hdr, "4"); + msgpack_write_array_header(&hdr, mp_state.ncols); + for (Int i = 0; i < mp_state.ncols; i++) { + msgpack_write_str(&hdr, mp_state.col_names[i], -1); + } + + /* Event type 5 (EXIT_INLINED) schema - same columns as ENTER/EXIT */ + msgpack_write_key(&hdr, "5"); + msgpack_write_array_header(&hdr, mp_state.ncols); + for (Int i = 0; i < mp_state.ncols; i++) { + msgpack_write_str(&hdr, mp_state.col_names[i], -1); + } + /* Compress and write header chunk */ SizeT src_size = hdr.size; SizeT dst_capacity = tg_lz4_compress_bound(src_size); @@ -164,8 +178,8 @@ static void msgpack_write_header(void) SizeT compressed_size = tg_lz4_compress( compressed, dst_capacity, hdr.data, src_size); - /* Magic + version (8 bytes): "TGMP" + version(4) - version 2 */ - UChar magic[8] = {'T', 'G', 'M', 'P', 0x02, 0x00, 0x00, 0x00}; + /* Magic + version (8 bytes): "TGMP" + version(4) - version 3 */ + UChar magic[8] = {'T', 'G', 'M', 'P', 0x03, 0x00, 0x00, 0x00}; VG_(write)(TG_(trace_out).fd, magic, 8); /* Header chunk size (4 bytes uncompressed, 4 bytes compressed) */ @@ -439,6 +453,92 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, deltas, es->size); } +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb) +{ + Int i; + + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = bb->inl_fn; + const HChar* obj_name = bb->obj ? bb->obj->name : "???"; + const HChar* file_name = (bb->fn && bb->fn->file) ? 
bb->fn->file->name : "???"; + UInt line = bb->line; + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED, + fn_name, obj_name, file_name, (Int)line, + deltas, es->size); +} + +void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) +{ + Int i; + + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + const HChar* obj_name = bb->obj ? bb->obj->name : "???"; + const HChar* file_name = (bb->fn && bb->fn->file) ? bb->fn->file->name : "???"; + UInt line = bb->line; + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED, + fn_name, obj_name, file_name, (Int)line, + deltas, es->size); +} + /* * Emit a FORK event when a child process is created. * Called from the post-syscall handler when fork/clone returns in parent. diff --git a/tracegrind/fn.c b/tracegrind/fn.c index e84e73cd9..ec6d02e14 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -593,6 +593,14 @@ fn_node* TG_(get_fn_node)(BB* bb) &dirname, &filename, &fnname, &line_num, &di); DiEpoch ep = VG_(current_DiEpoch)(); + + /* Check if BB start address is in inlined code */ + { + const HChar* inl_fn_name = NULL; + VG_(get_inline_fnname)(ep, bb_addr(bb), &inl_fn_name); + bb->inl_fn = inl_fn_name; /* NULL if not inlined */ + } + if (0 == VG_(strcmp)(fnname, "???")) { int p; static HChar buf[32]; // for sure large enough diff --git a/tracegrind/global.h b/tracegrind/global.h index d6a3d2b01..83898229e 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -81,10 +81,12 @@ typedef enum { /* Trace event types */ typedef enum { - TG_EV_MARKER = 0, - TG_EV_ENTER = 1, - TG_EV_EXIT = 2, - TG_EV_FORK = 3 + TG_EV_MARKER = 0, + TG_EV_ENTER = 1, + TG_EV_EXIT = 2, + TG_EV_FORK = 3, + TG_EV_ENTER_INLINED = 4, + TG_EV_EXIT_INLINED = 5 } TraceEventType; typedef struct _CommandLineOptions CommandLineOptions; @@ -142,7 +144,7 @@ struct _Statistics { ULong bb_executions; Int context_counter; - Int bb_retranslations; + Int bb_retranslations; Int distinct_objs; Int distinct_files; @@ -238,7 +240,7 @@ struct _jCC { }; -/* +/* * Info for one instruction of a basic block. */ typedef struct _InstrInfo InstrInfo; @@ -284,12 +286,12 @@ struct _BB { VgSectKind sect_kind; /* section of this BB, e.g. 
PLT */ UInt instr_count; - + /* filled by TG_(get_fn_node) if debug info is available */ fn_node* fn; /* debug info for this BB */ UInt line; Bool is_entry; /* True if this BB is a function entry */ - + BBCC* bbcc_list; /* BBCCs for same BB (see next_bbcc in BBCC) */ BBCC* last_bbcc; /* Temporary: Cached for faster access (LRU) */ @@ -299,6 +301,8 @@ struct _BB { * allocated directly after this struct */ Bool cjmp_inverted; /* is last side exit actually fall through? */ + const HChar* inl_fn; /* inlined function name at BB start, or NULL */ + UInt instr_len; UInt cost_count; InstrInfo instr[0]; /* info on instruction sizes and costs */ @@ -365,12 +369,12 @@ struct _BBCC { * recursion BBCCs. Shared. */ BBCC* next_bbcc; /* Chain of BBCCs for same BB */ BBCC* lru_next_bbcc; /* BBCC executed next the last time */ - + jCC* lru_from_jcc; /* Temporary: Cached for faster access (LRU) */ jCC* lru_to_jcc; /* Temporary: Cached for faster access (LRU) */ - FullCost skipped; /* cost for skipped functions called from + FullCost skipped; /* cost for skipped functions called from * jmp_addr. Allocated lazy */ - + BBCC* next; /* entry chain in hash */ ULong* cost; /* start of 64bit costs for this BBCC */ ULong ecounter_sum; /* execution counter for first instruction of BB */ @@ -432,7 +436,7 @@ struct _obj_node { * * is 0 if the function called is not skipped (usual case). * Otherwise, it is the last non-skipped BBCC. This one gets all - * the calls to non-skipped functions and all costs in skipped + * the calls to non-skipped functions and all costs in skipped * instructions. */ struct _call_entry { @@ -464,14 +468,14 @@ struct _exec_state { /* the signum of the handler, 0 for main thread context */ Int sig; - + /* the old call stack pointer at entering the signal handler */ Int orig_sp; - + FullCost cost; Bool collect; Context* cxt; - + /* number of conditional jumps passed in last BB */ Int jmps_passed; BBCC* bbcc; /* last BB executed */ @@ -491,7 +495,7 @@ typedef struct _cxt_hash cxt_hash; struct _cxt_hash { UInt size, entries; Context** table; -}; +}; /* Thread specific state structures, i.e. parts of a thread state. * There are variables for the current state of each part, @@ -540,7 +544,7 @@ struct _exec_stack { exec_state* entry[MAX_SIGHANDLERS]; }; -/* Thread State +/* Thread State * * This structure stores thread specific info while a thread is *not* * running. See function switch_thread() for save/restore on thread switch. 
@@ -562,6 +566,9 @@ struct _thread_info { /* CSV trace: per-thread snapshot of cost at last sample emission */ FullCost last_sample_cost; + /* Inline tracking: current inlined function name (NULL if not in inlined code) */ + const HChar* cur_inl_fn; + /* thread specific data structure containers */ fn_array fn_active; jcc_hash jccs; @@ -580,7 +587,7 @@ struct cachesim_if void (*clear)(void); void (*printstat)(Int,Int,Int); void (*finish)(void); - + void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); @@ -731,6 +738,8 @@ void TG_(run_post_signal_on_call_stack_bottom)(void); void TG_(trace_open_output)(void); void TG_(trace_reopen_child)(void); void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb); +void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker); void TG_(trace_close_output)(void); diff --git a/tracegrind/scripts/tracegrind-analyzer b/tracegrind/scripts/tracegrind-analyzer index b703c7fa9..933a3b543 100755 --- a/tracegrind/scripts/tracegrind-analyzer +++ b/tracegrind/scripts/tracegrind-analyzer @@ -19,6 +19,7 @@ Examples: import argparse import json +import os import struct import sys from collections import Counter @@ -91,7 +92,7 @@ def decode_trace(filepath: str) -> Tuple[int, Dict[str, Any], List[List[Any]]]: def get_event_name(event_type: int) -> str: """Convert event type to name.""" - return {0: 'MARKER', 1: 'ENTER', 2: 'EXIT', 3: 'FORK'}.get(event_type, f'UNKNOWN({event_type})') + return {0: 'MARKER', 1: 'ENTER', 2: 'EXIT', 3: 'FORK', 4: 'ENTER_INLINED', 5: 'EXIT_INLINED'}.get(event_type, f'UNKNOWN({event_type})') def format_row(row: List[Any], schema: Dict[str, Any]) -> Dict[str, Any]: @@ -165,7 +166,7 @@ def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: # Function stats (for ENTER/EXIT events) fn_counts = Counter() for row in rows: - if len(row) > 3 and row[2] in (1, 2): # ENTER or EXIT + if len(row) > 3 and row[2] in (1, 2, 4, 5): # ENTER, EXIT, ENTER_INLINED, or EXIT_INLINED fn_counts[row[3]] += 1 if fn_counts: @@ -201,8 +202,8 @@ def print_rows(rows: List[List[Any]], schema: Dict[str, Any], # Compact single-line format parts = [] for k, v in formatted.items(): - if isinstance(v, str) and len(v) > 50: - v = v[:47] + '...' 
+ if isinstance(v, str) and k in ('obj', 'file'): + v = os.path.basename(v) parts.append(f"{k}={v}") print(' | '.join(parts)) @@ -226,7 +227,7 @@ def main(): help='Print raw row arrays') parser.add_argument('--json', action='store_true', help='Output as JSON') - parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER', 'EXIT', 'FORK'], + parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER', 'EXIT', 'FORK', 'ENTER_INLINED', 'EXIT_INLINED'], help='Filter by event type') parser.add_argument('--fn', type=str, metavar='PATTERN', help='Filter by function name (substring match)') @@ -248,7 +249,7 @@ def main(): filtered_rows = rows if args.event: - event_map = {'MARKER': 0, 'ENTER': 1, 'EXIT': 2, 'FORK': 3} + event_map = {'MARKER': 0, 'ENTER': 1, 'EXIT': 2, 'FORK': 3, 'ENTER_INLINED': 4, 'EXIT_INLINED': 5} event_type = event_map[args.event] filtered_rows = [r for r in filtered_rows if len(r) > 2 and r[2] == event_type] diff --git a/tracegrind/sim.c b/tracegrind/sim.c index ee7260818..d1393d00f 100644 --- a/tracegrind/sim.c +++ b/tracegrind/sim.c @@ -1458,10 +1458,6 @@ static Bool cachesim_parse_opt(const HChar* arg) else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors) {} else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) { - if (clo_collect_cacheuse) { - /* Use counters only make sense with fine dumping */ - TG_(clo).dump_instr = True; - } } else if (VG_(str_clo_cache_opt)(arg, diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am index ec5fcd85b..74eb35b65 100644 --- a/tracegrind/tests/Makefile.am +++ b/tracegrind/tests/Makefile.am @@ -12,7 +12,9 @@ check_PROGRAMS = \ test_instr_toggle.bin \ test_toggle_collect.bin \ test_foo_bar_baz.bin \ - test_inline.bin + test_inline.bin \ + test_enter_inlined.bin \ + test_nested_inlined.bin AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) @@ -24,6 +26,10 @@ test_toggle_collect_bin_SOURCES = test_toggle_collect.c test_foo_bar_baz_bin_SOURCES = test_foo_bar_baz.c test_inline_bin_SOURCES = test_inline.c test_inline_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_enter_inlined_bin_SOURCES = test_enter_inlined.c +test_enter_inlined_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_nested_inlined_bin_SOURCES = test_nested_inlined.c +test_nested_inlined_bin_CFLAGS = $(AM_CFLAGS) -O1 -g EXTRA_DIST = \ test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ @@ -32,4 +38,6 @@ EXTRA_DIST = \ test_toggle_collect.vgtest test_toggle_collect.stderr.exp test_toggle_collect.post.exp \ test_foo_bar_baz.vgtest test_foo_bar_baz.stderr.exp test_foo_bar_baz.post.exp \ test_inline.vgtest test_inline.stderr.exp test_inline.post.exp \ + test_enter_inlined.vgtest test_enter_inlined.stderr.exp test_enter_inlined.post.exp \ + test_nested_inlined.vgtest test_nested_inlined.stderr.exp test_nested_inlined.post.exp \ test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/filter_stderr b/tracegrind/tests/filter_stderr index 67355a7b2..c62611e02 100755 --- a/tracegrind/tests/filter_stderr +++ b/tracegrind/tests/filter_stderr @@ -5,7 +5,7 @@ dir=`dirname $0` $dir/../../tests/filter_stderr_basic | # Remove "Tracegrind, ..." line and the following copyright line. 
-sed "/^Tracegrind, a call-graph generating cache profiler/ , /./ d" | +sed "/^Tracegrind, a streaming trace cache profiler/ , /./ d" | # Remove pointer to tracegrind_control sed "/^For interactive control,.*$/d" | @@ -33,4 +33,4 @@ sed "/warning: specified LL cache: line_size .*$/d" | sed "/warning: simulated LL cache: line_size .*$/d" | # Remove trace output file path messages -sed "/^Trace output to /d" +sed "/^Streaming trace output to /d" diff --git a/tracegrind/tests/test_basic.post.exp b/tracegrind/tests/test_basic.post.exp index 173b52f57..d6fbcf234 100644 --- a/tracegrind/tests/test_basic.post.exp +++ b/tracegrind/tests/test_basic.post.exp @@ -7,6 +7,8 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] Total rows: N @@ -16,3 +18,4 @@ Events by type: Threads: 1 Sequence range: N - N + diff --git a/tracegrind/tests/test_enter_inlined.c b/tracegrind/tests/test_enter_inlined.c new file mode 100644 index 000000000..7ab5593eb --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.c @@ -0,0 +1,32 @@ +#include "tracegrind.h" + +/* Force inlining - with --read-inline-info=yes these should produce + * ENTER_INLINED / EXIT_INLINED events in the trace */ +static inline __attribute__((always_inline)) int inlined_work(int a, int b) { + /* Make the function large enough to span multiple basic blocks + * so at least one BB boundary falls inside inlined code */ + int result = 0; + if (a > 0) { + result = a * b; + } else { + result = a + b; + } + return result; +} + +/* Prevent inlining - SHOULD appear as ENTER/EXIT */ +static int __attribute__((noinline)) not_inlined_caller(int n) { + /* Use volatile to prevent constant propagation */ + volatile int x = n; + return inlined_work(x, x + 1); +} + +int main(void) { + volatile int input = 3; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 12; +} diff --git a/tracegrind/tests/test_enter_inlined.post.exp b/tracegrind/tests/test_enter_inlined.post.exp new file mode 100644 index 000000000..4cbc80bd4 --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.post.exp @@ -0,0 +1,19 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT | fn=not_inlined_caller | 
obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_enter_inlined.stderr.exp b/tracegrind/tests/test_enter_inlined.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_enter_inlined.vgtest b/tracegrind/tests/test_enter_inlined.vgtest new file mode 100644 index 000000000..e8d628355 --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_enter_inlined.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_enter_inlined --instr-atstart=no --read-inline-info=yes +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_enter_inlined.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_enter_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_foo_bar_baz.post.exp b/tracegrind/tests/test_foo_bar_baz.post.exp index 3b6b2d2d3..9f89daf35 100644 --- a/tracegrind/tests/test_foo_bar_baz.post.exp +++ b/tracegrind/tests/test_foo_bar_baz.post.exp @@ -7,17 +7,19 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=foo | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=baz | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=bar | test_foo_bar_baz | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=foo | test_foo_bar_baz | ... 
| line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_inline.post.exp b/tracegrind/tests/test_inline.post.exp index 1aa9c05b2..276e3d3e3 100644 --- a/tracegrind/tests/test_inline.post.exp +++ b/tracegrind/tests/test_inline.post.exp @@ -7,9 +7,11 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=not_inlined_work | test_inline | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=not_inlined_work | test_inline | ... | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_instr_toggle.post.exp b/tracegrind/tests/test_instr_toggle.post.exp index dba045bc2..4a2fa95a9 100644 --- a/tracegrind/tests/test_instr_toggle.post.exp +++ b/tracegrind/tests/test_instr_toggle.post.exp @@ -7,13 +7,15 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=before-fibo -seq=N | tid=1 | event=ENTER | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=fibo | test_instr_toggle | ... | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=fibo | test_instr_toggle | ... 
| line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=after-fibo diff --git a/tracegrind/tests/test_marker.post.exp b/tracegrind/tests/test_marker.post.exp index b3b813b1a..7158215da 100644 --- a/tracegrind/tests/test_marker.post.exp +++ b/tracegrind/tests/test_marker.post.exp @@ -7,6 +7,8 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] Showing N of N rows [N, 1, 0, 'start-work'] diff --git a/tracegrind/tests/test_nested_inlined.c b/tracegrind/tests/test_nested_inlined.c new file mode 100644 index 000000000..2c1ca6c33 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.c @@ -0,0 +1,47 @@ +#include "tracegrind.h" + +/* Inner inlined function. + * With --read-inline-info=yes, should produce ENTER_INLINED / EXIT_INLINED + * events with fn=inner_inline. */ +static inline __attribute__((always_inline)) int inner_inline(int a) { + int result; + if (a > 0) { + result = a * 3; + } else { + result = a + 1; + } + return result; +} + +/* Outer inlined function - calls inner_inline. + * Should produce ENTER_INLINED events for both outer_inline and inner_inline, + * showing nested inline transitions. + * Uses volatile stores in both branches to prevent the compiler from + * converting the if-else to a branchless cmov. 
*/ +static inline __attribute__((always_inline)) int outer_inline(int a, int b) { + volatile int x; + if (a > b) { + x = a - b; + } else { + x = b - a; + } + int y = inner_inline(x); + return y + a; +} + +/* Non-inlined caller */ +static int __attribute__((noinline)) caller(int n) { + volatile int x = n; + return outer_inline(x, x + 1); +} + +int main(void) { + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + /* caller(5) -> outer_inline(5, 6): x=1, inner_inline(1)=3, 3+5=8 */ + return result != 8; +} diff --git a/tracegrind/tests/test_nested_inlined.post.exp b/tracegrind/tests/test_nested_inlined.post.exp new file mode 100644 index 000000000..382cf103a --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.post.exp @@ -0,0 +1,21 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=26 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_nested_inlined.stderr.exp b/tracegrind/tests/test_nested_inlined.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_nested_inlined.vgtest b/tracegrind/tests/test_nested_inlined.vgtest new file mode 100644 index 000000000..ff512078c --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_nested_inlined.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_nested_inlined --instr-atstart=no --read-inline-info=yes +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_nested_inlined.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_nested_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_schema.post.exp b/tracegrind/tests/test_schema.post.exp index 86f5f9a21..4b2b56d7b 100644 --- a/tracegrind/tests/test_schema.post.exp +++ b/tracegrind/tests/test_schema.post.exp @@ -7,4 +7,6 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 
'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] diff --git a/tracegrind/tests/test_toggle_collect.post.exp b/tracegrind/tests/test_toggle_collect.post.exp index 173b52f57..d6fbcf234 100644 --- a/tracegrind/tests/test_toggle_collect.post.exp +++ b/tracegrind/tests/test_toggle_collect.post.exp @@ -7,6 +7,8 @@ Event Schemas (discriminated union): 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] Total rows: N @@ -16,3 +18,4 @@ Events by type: Threads: 1 Sequence range: N - N + From f2334e77d64218536d28f220e0b4d03ad71aca32 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 02:00:39 +0000 Subject: [PATCH 12/26] feat(tracegrind): proper nested inline tracking via InlIPCursor stack diffing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace flat single-pointer inline tracking with a per-BB inline call stack built via Valgrind's InlIPCursor API. BB-to-BB transitions now diff the old and new inline stacks to emit the minimal EXIT/ENTER sequence, producing correct containment (ENTER outer → ENTER inner → EXIT inner → EXIT outer) instead of flat transitions. Co-Authored-By: Claude Opus 4.6 --- tracegrind/bb.c | 5 ++- tracegrind/bbcc.c | 36 ++++++++++++++----- tracegrind/callstack.c | 26 +++++++------- tracegrind/dump.c | 4 +-- tracegrind/fn.c | 27 +++++++++++--- tracegrind/global.h | 13 ++++--- tracegrind/tests/test_nested_inlined.post.exp | 2 +- 7 files changed, 81 insertions(+), 32 deletions(-) diff --git a/tracegrind/bb.c b/tracegrind/bb.c index e34021004..9c69e7c6e 100644 --- a/tracegrind/bb.c +++ b/tracegrind/bb.c @@ -143,7 +143,8 @@ static BB* new_bb(obj_node* obj, PtrdiffT offset, bb->fn = 0; bb->line = 0; bb->is_entry = 0; - bb->inl_fn = NULL; + bb->inl_fns = NULL; + bb->inl_depth = 0; bb->bbcc_list = 0; bb->last_bbcc = 0; @@ -332,6 +333,8 @@ void TG_(delete_bb)(Addr addr) if (bb->bbcc_list == 0) { /* can be safely deleted */ + if (bb->inl_fns) VG_(free)(bb->inl_fns); + /* Fill the block up with junk and then free it, so we will hopefully get a segfault if it is used again by mistake. */ size = sizeof(BB) diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c index a511f6bc5..597f61e64 100644 --- a/tracegrind/bbcc.c +++ b/tracegrind/bbcc.c @@ -831,17 +831,37 @@ void TG_(setup_bbcc)(BB* bb) TG_(current_state).bbcc = bbcc; - /* Check for inline function transition */ + /* Check for inline function transitions */ if (TG_(current_state).collect) { thread_info* ti = TG_(get_current_thread)(); - if (ti && bb->inl_fn != ti->cur_inl_fn) { - if (ti->cur_inl_fn != NULL) { - TG_(trace_emit_exit_inlined)(TG_(current_tid), bb, ti->cur_inl_fn); + if (ti) { + UInt old_depth = ti->cur_inl_depth; + UInt new_depth = bb->inl_depth; + + /* Fast path: both empty (most BBs) */ + if (old_depth != 0 || new_depth != 0) { + /* Find longest common prefix */ + UInt common = 0; + UInt min_depth = old_depth < new_depth ? 
old_depth : new_depth; + while (common < min_depth && + ti->cur_inl_fns[common] == bb->inl_fns[common]) + common++; + + /* EXIT from deepest down to common level */ + for (Int i = (Int)old_depth - 1; i >= (Int)common; i--) + TG_(trace_emit_exit_inlined)(TG_(current_tid), bb, + ti->cur_inl_fns[i]); + + /* ENTER from common level up to new deepest */ + for (UInt i = common; i < new_depth; i++) + TG_(trace_emit_enter_inlined)(TG_(current_tid), bb, + bb->inl_fns[i]); + + /* Update thread state */ + for (UInt i = 0; i < new_depth; i++) + ti->cur_inl_fns[i] = bb->inl_fns[i]; + ti->cur_inl_depth = new_depth; } - if (bb->inl_fn != NULL) { - TG_(trace_emit_enter_inlined)(TG_(current_tid), bb); - } - ti->cur_inl_fn = bb->inl_fn; } } diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index dd694b493..1cc288dfc 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -234,13 +234,14 @@ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) /* Emit trace sample on function entry */ if (!skip && TG_(current_state).collect) { - /* Emit EXIT_INLINED if we're entering a new function while inside inlined code */ + /* Exit entire inline stack, deepest first */ thread_info* ti = TG_(get_current_thread)(); - if (ti && ti->cur_inl_fn != NULL && TG_(current_state).bbcc) { - TG_(trace_emit_exit_inlined)(TG_(current_tid), - TG_(current_state).bbcc->bb, - ti->cur_inl_fn); - ti->cur_inl_fn = NULL; + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined)(TG_(current_tid), + TG_(current_state).bbcc->bb, + ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; } fn_node* to_fn = to->cxt->fn[0]; TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); @@ -343,13 +344,14 @@ void TG_(pop_call_stack)(void) /* Emit trace sample on function exit */ if (TG_(current_state).collect) { - /* Emit EXIT_INLINED if we're leaving while inside inlined code */ + /* Exit entire inline stack, deepest first */ thread_info* ti = TG_(get_current_thread)(); - if (ti && ti->cur_inl_fn != NULL && TG_(current_state).bbcc) { - TG_(trace_emit_exit_inlined)(TG_(current_tid), - TG_(current_state).bbcc->bb, - ti->cur_inl_fn); - ti->cur_inl_fn = NULL; + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined)(TG_(current_tid), + TG_(current_state).bbcc->bb, + ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; } TG_(trace_emit_sample)(TG_(current_tid), False, to_fn); } diff --git a/tracegrind/dump.c b/tracegrind/dump.c index 811b5c5ae..c5a058027 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -453,7 +453,7 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, deltas, es->size); } -void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb) +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) { Int i; @@ -473,7 +473,7 @@ void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb) TG_(trace_out).seq++; - const HChar* fn_name = bb->inl_fn; + const HChar* fn_name = inl_fn; const HChar* obj_name = bb->obj ? bb->obj->name : "???"; const HChar* file_name = (bb->fn && bb->fn->file) ? 
bb->fn->file->name : "???"; UInt line = bb->line; diff --git a/tracegrind/fn.c b/tracegrind/fn.c index ec6d02e14..710dfd00d 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -594,11 +594,30 @@ fn_node* TG_(get_fn_node)(BB* bb) DiEpoch ep = VG_(current_DiEpoch)(); - /* Check if BB start address is in inlined code */ + /* Build inline stack for this BB using InlIPCursor */ { - const HChar* inl_fn_name = NULL; - VG_(get_inline_fnname)(ep, bb_addr(bb), &inl_fn_name); - bb->inl_fn = inl_fn_name; /* NULL if not inlined */ + InlIPCursor* iipc = VG_(new_IIPC)(ep, bb_addr(bb)); + if (iipc) { + const HChar* tmp[TG_MAX_INL_DEPTH + 1]; + Int total = 0; + do { + const HChar* fn_name = NULL; + VG_(get_fnname_inl)(ep, bb_addr(bb), &fn_name, iipc); + if (fn_name && total < TG_MAX_INL_DEPTH + 1) + tmp[total++] = fn_name; + } while (VG_(next_IIPC)(iipc)); + VG_(delete_IIPC)(iipc); + + /* tmp[] is innermost-first; last entry is the non-inlined function (skip it) */ + Int inl_count = total - 1; + if (inl_count > 0) { + bb->inl_depth = inl_count; + bb->inl_fns = VG_(malloc)("tg.bb.inl", inl_count * sizeof(HChar*)); + /* Reverse into outermost-first order */ + for (Int i = 0; i < inl_count; i++) + bb->inl_fns[i] = tmp[inl_count - 1 - i]; + } + } } if (0 == VG_(strcmp)(fnname, "???")) { diff --git a/tracegrind/global.h b/tracegrind/global.h index 83898229e..398ed6bd7 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -59,6 +59,9 @@ /* Enable experimental features? */ #define TG_EXPERIMENTAL 0 +/* Maximum depth of inline call stack tracking */ +#define TG_MAX_INL_DEPTH 16 + /*------------------------------------------------------------*/ /*--- Command line options ---*/ @@ -301,7 +304,8 @@ struct _BB { * allocated directly after this struct */ Bool cjmp_inverted; /* is last side exit actually fall through? 
*/ - const HChar* inl_fn; /* inlined function name at BB start, or NULL */ + const HChar** inl_fns; /* inlined fn names at BB start (outermost first), or NULL */ + UInt inl_depth; /* number of entries in inl_fns */ UInt instr_len; UInt cost_count; @@ -566,8 +570,9 @@ struct _thread_info { /* CSV trace: per-thread snapshot of cost at last sample emission */ FullCost last_sample_cost; - /* Inline tracking: current inlined function name (NULL if not in inlined code) */ - const HChar* cur_inl_fn; + /* Inline tracking: current inline call stack (outermost first) */ + const HChar* cur_inl_fns[TG_MAX_INL_DEPTH]; + UInt cur_inl_depth; /* thread specific data structure containers */ fn_array fn_active; @@ -738,7 +743,7 @@ void TG_(run_post_signal_on_call_stack_bottom)(void); void TG_(trace_open_output)(void); void TG_(trace_reopen_child)(void); void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); -void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb); +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker); diff --git a/tracegrind/tests/test_nested_inlined.post.exp b/tracegrind/tests/test_nested_inlined.post.exp index 382cf103a..d69724cbc 100644 --- a/tracegrind/tests/test_nested_inlined.post.exp +++ b/tracegrind/tests/test_nested_inlined.post.exp @@ -14,8 +14,8 @@ Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=ENTER | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N seq=N | tid=1 | event=ENTER_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=26 | Ir=N -seq=N | tid=1 | event=EXIT_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N seq=N | tid=1 | event=ENTER_INLINED | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N seq=N | tid=1 | event=EXIT_INLINED | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N seq=N | tid=1 | event=EXIT | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end From 7d55d1cd7234db754add2a56ed0cee3f3aa2d896 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 02:29:39 +0000 Subject: [PATCH 13/26] test(tracegrind): add control flow regression tests Add tests for signal handling, C++ exceptions, longjmp, tail calls, and deep recursion (100 levels) to verify call stack correctness across non-trivial control flow. Also fix missing -I include path for tracegrind.h in test Makefile. 
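The invariant these tests exercise is that every ENTER in the trace is matched by an EXIT, even when control leaves a frame via throw, longjmp, or a signal. As a rough illustration (not part of the patch), over rows decoded by tracegrind-analyzer's decode_trace, where event 1 is ENTER and 2 is EXIT at this point in the series, the check amounts to:

```python
from collections import Counter

def enter_exit_balance(rows):
    """Per-function ENTER minus EXIT counts; an empty result means balanced."""
    balance = Counter()
    for row in rows:
        if len(row) > 3 and row[2] in (1, 2):   # 1 = ENTER, 2 = EXIT
            balance[row[3]] += 1 if row[2] == 1 else -1
    return {fn: n for fn, n in balance.items() if n != 0}

# Hand-written example rows: [seq, tid, event, fn, obj, file, line, Ir]
rows = [
    [1, 1, 1, 'recurse', 'test_recursion.bin', 'test_recursion.c', 0, 10],
    [2, 1, 1, 'recurse', 'test_recursion.bin', 'test_recursion.c', 0, 12],
    [3, 1, 2, 'recurse', 'test_recursion.bin', 'test_recursion.c', 0, 9],
    [4, 1, 2, 'recurse', 'test_recursion.bin', 'test_recursion.c', 0, 11],
]
assert enter_exit_balance(rows) == {}   # two ENTERs, two EXITs: balanced
```

test_recursion.vgtest performs the same counting with awk over the analyzer's text output.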
Co-Authored-By: Claude Opus 4.6 --- tracegrind/tests/Makefile.am | 19 ++++++++- tracegrind/tests/test_exception.cpp | 42 +++++++++++++++++++ tracegrind/tests/test_exception.post.exp | 14 +++++++ tracegrind/tests/test_exception.stderr.exp | 6 +++ tracegrind/tests/test_exception.vgtest | 5 +++ tracegrind/tests/test_longjmp.c | 47 ++++++++++++++++++++++ tracegrind/tests/test_longjmp.post.exp | 14 +++++++ tracegrind/tests/test_longjmp.stderr.exp | 6 +++ tracegrind/tests/test_longjmp.vgtest | 5 +++ tracegrind/tests/test_recursion.c | 26 ++++++++++++ tracegrind/tests/test_recursion.post.exp | 4 ++ tracegrind/tests/test_recursion.stderr.exp | 6 +++ tracegrind/tests/test_recursion.vgtest | 5 +++ tracegrind/tests/test_signal.c | 39 ++++++++++++++++++ tracegrind/tests/test_signal.post.exp | 10 +++++ tracegrind/tests/test_signal.stderr.exp | 6 +++ tracegrind/tests/test_signal.vgtest | 5 +++ tracegrind/tests/test_tailcall.c | 33 +++++++++++++++ tracegrind/tests/test_tailcall.post.exp | 14 +++++++ tracegrind/tests/test_tailcall.stderr.exp | 6 +++ tracegrind/tests/test_tailcall.vgtest | 5 +++ 21 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 tracegrind/tests/test_exception.cpp create mode 100644 tracegrind/tests/test_exception.post.exp create mode 100644 tracegrind/tests/test_exception.stderr.exp create mode 100644 tracegrind/tests/test_exception.vgtest create mode 100644 tracegrind/tests/test_longjmp.c create mode 100644 tracegrind/tests/test_longjmp.post.exp create mode 100644 tracegrind/tests/test_longjmp.stderr.exp create mode 100644 tracegrind/tests/test_longjmp.vgtest create mode 100644 tracegrind/tests/test_recursion.c create mode 100644 tracegrind/tests/test_recursion.post.exp create mode 100644 tracegrind/tests/test_recursion.stderr.exp create mode 100644 tracegrind/tests/test_recursion.vgtest create mode 100644 tracegrind/tests/test_signal.c create mode 100644 tracegrind/tests/test_signal.post.exp create mode 100644 tracegrind/tests/test_signal.stderr.exp create mode 100644 tracegrind/tests/test_signal.vgtest create mode 100644 tracegrind/tests/test_tailcall.c create mode 100644 tracegrind/tests/test_tailcall.post.exp create mode 100644 tracegrind/tests/test_tailcall.stderr.exp create mode 100644 tracegrind/tests/test_tailcall.vgtest diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am index 74eb35b65..d84826a8b 100644 --- a/tracegrind/tests/Makefile.am +++ b/tracegrind/tests/Makefile.am @@ -14,8 +14,14 @@ check_PROGRAMS = \ test_foo_bar_baz.bin \ test_inline.bin \ test_enter_inlined.bin \ - test_nested_inlined.bin + test_nested_inlined.bin \ + test_signal.bin \ + test_exception.bin \ + test_longjmp.bin \ + test_tailcall.bin \ + test_recursion.bin +AM_CPPFLAGS += -I$(top_srcdir)/tracegrind AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) @@ -30,6 +36,12 @@ test_enter_inlined_bin_SOURCES = test_enter_inlined.c test_enter_inlined_bin_CFLAGS = $(AM_CFLAGS) -O2 -g test_nested_inlined_bin_SOURCES = test_nested_inlined.c test_nested_inlined_bin_CFLAGS = $(AM_CFLAGS) -O1 -g +test_signal_bin_SOURCES = test_signal.c +test_exception_bin_SOURCES = test_exception.cpp +test_longjmp_bin_SOURCES = test_longjmp.c +test_tailcall_bin_SOURCES = test_tailcall.c +test_tailcall_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_recursion_bin_SOURCES = test_recursion.c EXTRA_DIST = \ test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ @@ -40,4 +52,9 @@ EXTRA_DIST = \ test_inline.vgtest test_inline.stderr.exp test_inline.post.exp \ test_enter_inlined.vgtest 
test_enter_inlined.stderr.exp test_enter_inlined.post.exp \ test_nested_inlined.vgtest test_nested_inlined.stderr.exp test_nested_inlined.post.exp \ + test_signal.vgtest test_signal.stderr.exp test_signal.post.exp \ + test_exception.vgtest test_exception.stderr.exp test_exception.post.exp \ + test_longjmp.vgtest test_longjmp.stderr.exp test_longjmp.post.exp \ + test_tailcall.vgtest test_tailcall.stderr.exp test_tailcall.post.exp \ + test_recursion.vgtest test_recursion.stderr.exp test_recursion.post.exp \ test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/test_exception.cpp b/tracegrind/tests/test_exception.cpp new file mode 100644 index 000000000..8b791728a --- /dev/null +++ b/tracegrind/tests/test_exception.cpp @@ -0,0 +1,42 @@ +#include "tracegrind.h" +#include <stdexcept> + +/* + * Test: C++ exception unwinding through regular (non-inlined) functions. + * + * catcher() calls thrower(), which calls do_throw(). + * do_throw() throws an exception that unwinds back through thrower() + * to catcher()'s catch block. Verifies the call stack is properly + * maintained across exception unwinding. + * + * Call chain: catcher -> thrower -> do_throw (throws) + */ + +static void __attribute__((noinline)) do_throw(int x) { + if (x > 0) + throw std::runtime_error("boom"); +} + +static int __attribute__((noinline)) thrower(int n) { + volatile int x = n; + do_throw(x); + return x; +} + +static int __attribute__((noinline)) catcher(int n) { + try { + return thrower(n); + } catch (const std::exception&) { + return -1; + } +} + +int main() { + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = catcher(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != -1; +} diff --git a/tracegrind/tests/test_exception.post.exp b/tracegrind/tests/test_exception.post.exp new file mode 100644 index 000000000..f1a06baf7 --- /dev/null +++ b/tracegrind/tests/test_exception.post.exp @@ -0,0 +1,14 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_exception.stderr.exp b/tracegrind/tests/test_exception.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_exception.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_exception.vgtest b/tracegrind/tests/test_exception.vgtest new file mode 100644 index 000000000..3c098a799 --- /dev/null +++ b/tracegrind/tests/test_exception.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog:
test_exception.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_exception --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_exception.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=catcher|fn=thrower|fn=do_throw)' +cleanup: rm -f tracegrind.out.test_exception.msgpack.lz4 diff --git a/tracegrind/tests/test_longjmp.c b/tracegrind/tests/test_longjmp.c new file mode 100644 index 000000000..ffca450f8 --- /dev/null +++ b/tracegrind/tests/test_longjmp.c @@ -0,0 +1,47 @@ +#include "tracegrind.h" +#include <setjmp.h> + +/* + * Test: longjmp unwinding multiple call frames. + * + * outer() calls middle(), which calls inner(). + * inner() does longjmp back to outer(), skipping middle()'s return. + * Verifies tracegrind properly unwinds the call stack on non-local jumps. + * + * Call chain: outer -> middle -> inner (longjmp back to outer) + */ + +static jmp_buf env; + +static void __attribute__((noinline)) inner(int n) { + volatile int x = n * 2; + (void)x; + longjmp(env, 42); +} + +static void __attribute__((noinline)) middle(int n) { + volatile int x = n + 1; + inner(x); + /* never reached */ + x = x + 1; +} + +static int __attribute__((noinline)) outer(int n) { + int val = setjmp(env); + if (val == 0) { + middle(n); + /* never reached */ + return -1; + } + return val; +} + +int main(void) { + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = outer(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 42; +} diff --git a/tracegrind/tests/test_longjmp.post.exp b/tracegrind/tests/test_longjmp.post.exp new file mode 100644 index 000000000..2577e4828 --- /dev/null +++ b/tracegrind/tests/test_longjmp.post.exp @@ -0,0 +1,14 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_longjmp.stderr.exp b/tracegrind/tests/test_longjmp.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_longjmp.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_longjmp.vgtest b/tracegrind/tests/test_longjmp.vgtest new file mode 100644 index 000000000..2ee68b2af --- /dev/null +++ b/tracegrind/tests/test_longjmp.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_longjmp.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_longjmp --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_longjmp.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=outer|fn=middle|fn=inner)' +cleanup: rm -f
tracegrind.out.test_longjmp.msgpack.lz4 diff --git a/tracegrind/tests/test_recursion.c b/tracegrind/tests/test_recursion.c new file mode 100644 index 000000000..0d96b0cca --- /dev/null +++ b/tracegrind/tests/test_recursion.c @@ -0,0 +1,26 @@ +#include "tracegrind.h" + +/* + * Test: deep recursion (100 levels). + * + * recurse() calls itself 100 times, then returns back through + * all frames. Verifies the call stack handles deep nesting and + * produces balanced ENTER/EXIT pairs. + */ + +static int __attribute__((noinline)) recurse(int depth) { + volatile int d = depth; + if (d <= 0) + return 0; + return recurse(d - 1) + 1; +} + +int main(void) { + volatile int input = 100; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = recurse(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 100; +} diff --git a/tracegrind/tests/test_recursion.post.exp b/tracegrind/tests/test_recursion.post.exp new file mode 100644 index 000000000..c92dd1288 --- /dev/null +++ b/tracegrind/tests/test_recursion.post.exp @@ -0,0 +1,4 @@ +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=MARKER | marker=end +ENTER count: 101 +EXIT count: 101 diff --git a/tracegrind/tests/test_recursion.stderr.exp b/tracegrind/tests/test_recursion.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_recursion.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_recursion.vgtest b/tracegrind/tests/test_recursion.vgtest new file mode 100644 index 000000000..fcb7fb43e --- /dev/null +++ b/tracegrind/tests/test_recursion.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_recursion.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_recursion --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_recursion.msgpack.lz4 | ./filter_trace | grep -E '(event=MARKER|fn=recurse)' | awk '/MARKER/{print} /ENTER/{e++} /EXIT/{x++} END{print "ENTER count: "e; print "EXIT count: "x}' +cleanup: rm -f tracegrind.out.test_recursion.msgpack.lz4 diff --git a/tracegrind/tests/test_signal.c b/tracegrind/tests/test_signal.c new file mode 100644 index 000000000..3bfa48d0e --- /dev/null +++ b/tracegrind/tests/test_signal.c @@ -0,0 +1,39 @@ +#include "tracegrind.h" +#include <signal.h> +#include <string.h> + +/* + * Test: signal handler interrupting normal function execution. + * + * caller() raises SIGALRM to itself. The signal handler (handler_fn) + * runs, then execution returns to caller(). Verifies the call stack + * is properly maintained across signal delivery.
+ */ + +static volatile sig_atomic_t got_signal = 0; + +static void __attribute__((noinline)) handler_fn(int sig) { + (void)sig; + got_signal = 1; +} + +static int __attribute__((noinline)) caller(int n) { + volatile int x = n; + raise(SIGALRM); + return x + 1; +} + +int main(void) { + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = handler_fn; + sigaction(SIGALRM, &sa, NULL); + + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return (result != 6) || !got_signal; +} diff --git a/tracegrind/tests/test_signal.post.exp b/tracegrind/tests/test_signal.post.exp new file mode 100644 index 000000000..dfb11ddde --- /dev/null +++ b/tracegrind/tests/test_signal.post.exp @@ -0,0 +1,10 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_signal.stderr.exp b/tracegrind/tests/test_signal.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_signal.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_signal.vgtest b/tracegrind/tests/test_signal.vgtest new file mode 100644 index 000000000..a61fbd7bd --- /dev/null +++ b/tracegrind/tests/test_signal.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_signal.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_signal --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_signal.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=caller|fn=handler_fn)' +cleanup: rm -f tracegrind.out.test_signal.msgpack.lz4 diff --git a/tracegrind/tests/test_tailcall.c b/tracegrind/tests/test_tailcall.c new file mode 100644 index 000000000..4a2868e70 --- /dev/null +++ b/tracegrind/tests/test_tailcall.c @@ -0,0 +1,33 @@ +#include "tracegrind.h" + +/* + * Test: tail call optimization. + * + * chain_a() tail-calls chain_b(), which tail-calls chain_c(). + * At -O2, the compiler should optimize these into JMP instructions + * rather than CALL+RET. Verifies tracegrind handles sibling calls. 
+ * + * Call chain: chain_a --(tail call)--> chain_b --(tail call)--> chain_c + */ + +static int __attribute__((noinline)) chain_c(int n) { + return n + 3; +} + +static int __attribute__((noinline)) chain_b(int n) { + return chain_c(n + 2); +} + +static int __attribute__((noinline)) chain_a(int n) { + return chain_b(n + 1); +} + +int main(void) { + volatile int input = 10; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = chain_a(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 16; +} diff --git a/tracegrind/tests/test_tailcall.post.exp b/tracegrind/tests/test_tailcall.post.exp new file mode 100644 index 000000000..8cb6e349f --- /dev/null +++ b/tracegrind/tests/test_tailcall.post.exp @@ -0,0 +1,14 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_tailcall.stderr.exp b/tracegrind/tests/test_tailcall.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_tailcall.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_tailcall.vgtest b/tracegrind/tests/test_tailcall.vgtest new file mode 100644 index 000000000..f954ada67 --- /dev/null +++ b/tracegrind/tests/test_tailcall.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_tailcall.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_tailcall --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_tailcall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=chain_)' +cleanup: rm -f tracegrind.out.test_tailcall.msgpack.lz4 From 7a65b7980e0c17a43ff379d915810ce395400748 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 03:02:53 +0000 Subject: [PATCH 14/26] feat(tracegrind): add THREAD_CREATE event and rename/reorder event types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Emit THREAD_CREATE (type 6) when new threads are spawned, using VG_(track_pre_thread_ll_create). Suppress spurious FORK events for pthread_create by checking CLONE_THREAD flag in clone/clone3 syscalls. Rename events for consistency: ENTER→ENTER_FN, EXIT→EXIT_FN, ENTER_INLINED→ENTER_INLINED_FN, EXIT_INLINED→EXIT_INLINED_FN. Reorder: ENTER_INLINED_FN=3, EXIT_INLINED_FN=4, FORK=5. 
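Because this renumbers event IDs that trace consumers match on, here is a minimal sketch (illustrative only, mirroring get_event_name() in tracegrind-analyzer and the updated format doc) of the mapping after this change:

```python
# Event numbering for format version 3.
EVENTS = {
    0: 'MARKER',
    1: 'ENTER_FN',
    2: 'EXIT_FN',
    3: 'ENTER_INLINED_FN',
    4: 'EXIT_INLINED_FN',
    5: 'FORK',            # row: [seq, tid, 5, child_pid]
    6: 'THREAD_CREATE',   # row: [seq, tid, 6, child_tid]
}

def thread_creates(rows):
    """Return (tid, child_tid) pairs for THREAD_CREATE rows."""
    return [(r[1], r[3]) for r in rows if len(r) > 3 and r[2] == 6]
```

Consumers written against format version 2 need to update both the FORK id (3 becomes 5) and any matching on the old ENTER/EXIT names.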
Co-Authored-By: Claude Opus 4.6 --- tracegrind/docs/tracegrind-msgpack-format.md | 63 ++++++++++++---- tracegrind/dump.c | 73 ++++++++++++++----- tracegrind/global.h | 14 ++-- tracegrind/main.c | 33 ++++++++- tracegrind/scripts/tracegrind-analyzer | 19 +++-- tracegrind/tests/Makefile.am | 6 +- tracegrind/tests/filter_trace | 3 + tracegrind/tests/test_basic.post.exp | 15 ++-- tracegrind/tests/test_enter_inlined.post.exp | 19 ++--- tracegrind/tests/test_exception.post.exp | 12 +-- tracegrind/tests/test_foo_bar_baz.post.exp | 31 ++++---- tracegrind/tests/test_inline.post.exp | 15 ++-- tracegrind/tests/test_instr_toggle.post.exp | 23 +++--- tracegrind/tests/test_longjmp.post.exp | 12 +-- tracegrind/tests/test_marker.post.exp | 11 +-- tracegrind/tests/test_nested_inlined.post.exp | 23 +++--- tracegrind/tests/test_recursion.post.exp | 4 +- tracegrind/tests/test_recursion.vgtest | 2 +- tracegrind/tests/test_schema.post.exp | 11 +-- tracegrind/tests/test_signal.post.exp | 4 +- tracegrind/tests/test_tailcall.post.exp | 12 +-- tracegrind/tests/test_thread_create.c | 18 +++++ tracegrind/tests/test_thread_create.post.exp | 17 +++++ .../tests/test_thread_create.stderr.exp | 6 ++ tracegrind/tests/test_thread_create.vgtest | 5 ++ tracegrind/tests/test_toggle_collect.post.exp | 15 ++-- 26 files changed, 315 insertions(+), 151 deletions(-) create mode 100644 tracegrind/tests/test_thread_create.c create mode 100644 tracegrind/tests/test_thread_create.post.exp create mode 100644 tracegrind/tests/test_thread_create.stderr.exp create mode 100644 tracegrind/tests/test_thread_create.vgtest diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index fbf7b71dc..f25975e2e 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -23,7 +23,7 @@ Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 | Offset | Size | Field | Description | |--------|------|---------|-------------| | 0 | 4 | magic | ASCII `TGMP` (0x54 0x47 0x4D 0x50) | -| 4 | 4 | version | Format version, uint32 LE (currently 2) | +| 4 | 4 | version | Format version, uint32 LE (currently 3) | ## Chunk Format @@ -41,13 +41,16 @@ The first chunk contains a MsgPack map describing the discriminated union schema ```json { - "version": 2, + "version": 3, "format": "tracegrind-msgpack", "event_schemas": { "0": ["seq", "tid", "event", "marker"], "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], "2": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], - "3": ["seq", "tid", "event", "child_pid"] + "3": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "4": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "5": ["seq", "tid", "event", "child_pid"], + "6": ["seq", "tid", "event", "child_tid"] } } ``` @@ -57,9 +60,12 @@ The first chunk contains a MsgPack map describing the discriminated union schema | Type | Name | Description | |------|--------|-------------| | 0 | MARKER | Named marker | -| 1 | ENTER | Function entry | -| 2 | EXIT | Function exit | -| 3 | FORK | Child process created | +| 1 | ENTER_FN | Function entry | +| 2 | EXIT_FN | Function exit | +| 3 | ENTER_INLINED_FN | Inlined function entry | +| 4 | EXIT_INLINED_FN | Inlined function exit | +| 5 | FORK | Child process created | +| 6 | THREAD_CREATE | New thread created | ### Row Schemas @@ -72,31 +78,55 @@ The first chunk contains a MsgPack map describing the discriminated union schema | 2 | event | int | 0 = 
MARKER | | 3 | marker | string | Marker label | -**ENTER/EXIT rows (event 1, 2):** +**ENTER_FN/EXIT_FN rows (event 1, 2):** | Index | Name | Type | Description | |-------|-------|--------|-------------| | 0 | seq | uint64 | Sequence number | | 1 | tid | int32 | Thread ID | -| 2 | event | int | 1 = ENTER, 2 = EXIT | +| 2 | event | int | 1 = ENTER_FN, 2 = EXIT_FN | | 3 | fn | string | Function name | | 4 | obj | string | Shared object path | | 5 | file | string | Source file path | | 6 | line | int32 | Line number (0 if unknown) | | 7+ | ... | int64 | Event counter deltas (Ir, Dr, Dw, etc.) | -**FORK rows (event 3):** +**ENTER_INLINED_FN/EXIT_INLINED_FN rows (event 3, 4):** + +Same schema as ENTER_FN/EXIT_FN rows. + +| Index | Name | Type | Description | +|-------|-------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 3 = ENTER_INLINED_FN, 4 = EXIT_INLINED_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7+ | ... | int64 | Event counter deltas (Ir, Dr, Dw, etc.) | + +**FORK rows (event 5):** | Index | Name | Type | Description | |-------|-----------|--------|-------------| | 0 | seq | uint64 | Sequence number | | 1 | tid | int32 | Thread ID that called fork | -| 2 | event | int | 3 = FORK | +| 2 | event | int | 5 = FORK | | 3 | child_pid | int32 | PID of the new child process | +**THREAD_CREATE rows (event 6):** + +| Index | Name | Type | Description | +|-------|-----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID that created the new thread | +| 2 | event | int | 6 = THREAD_CREATE | +| 3 | child_tid | int32 | Thread ID of the new child thread | + ### Event Counter Columns -For ENTER/EXIT rows, event counters appear as delta values starting at index 7. Which counters are present depends on Tracegrind options: +For ENTER_FN/EXIT_FN/ENTER_INLINED_FN/EXIT_INLINED_FN rows, event counters appear as delta values starting at index 7. Which counters are present depends on Tracegrind options: `Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim` @@ -106,9 +136,12 @@ Each data chunk contains concatenated MsgPack arrays. The row format depends on ``` [seq, tid, 0, marker] # MARKER -[seq, tid, 1, fn, obj, file, line, delta_Ir, ...] # ENTER -[seq, tid, 2, fn, obj, file, line, delta_Ir, ...] # EXIT -[seq, tid, 3, child_pid] # FORK +[seq, tid, 1, fn, obj, file, line, delta_Ir, ...] # ENTER_FN +[seq, tid, 2, fn, obj, file, line, delta_Ir, ...] # EXIT_FN +[seq, tid, 3, fn, obj, file, line, delta_Ir, ...] # ENTER_INLINED_FN +[seq, tid, 4, fn, obj, file, line, delta_Ir, ...] # EXIT_INLINED_FN +[seq, tid, 5, child_pid] # FORK +[seq, tid, 6, child_tid] # THREAD_CREATE ``` The reference implementation writes 4096 rows per chunk. @@ -126,7 +159,7 @@ def read_tracegrind(filepath): with open(filepath, 'rb') as f: assert f.read(4) == b'TGMP' version = struct.unpack('= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + /* Add a MARKER row to the msgpack output (seq, tid, event, marker_str) */ static void msgpack_add_marker_row(ULong seq, Int tid, const HChar* marker) { @@ -445,8 +469,7 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, } } - /* Event type: 0=ENTER, 1=EXIT */ - Int event_val = is_enter ? TG_EV_ENTER : TG_EV_EXIT; + Int event_val = is_enter ? 
TG_EV_ENTER_FN : TG_EV_EXIT_FN; msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, fn_name, obj_name, file_name, (Int)line, @@ -491,7 +514,7 @@ void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) } } - msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED, + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED_FN, fn_name, obj_name, file_name, (Int)line, deltas, es->size); } @@ -534,7 +557,7 @@ void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) } } - msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED, + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED_FN, fn_name, obj_name, file_name, (Int)line, deltas, es->size); } @@ -555,6 +578,16 @@ void TG_(trace_emit_fork)(ThreadId tid, Int child_pid) msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); } +void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child) +{ + if (!TG_(trace_out).initialized) return; + if (TG_(trace_out).fd < 0) return; + + TG_(trace_out).seq++; + + msgpack_add_thread_create_row(TG_(trace_out).seq, (Int)tid, (Int)child); +} + void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker) { if (!TG_(trace_out).initialized) return; diff --git a/tracegrind/global.h b/tracegrind/global.h index 398ed6bd7..f0312ed6b 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -84,12 +84,13 @@ typedef enum { /* Trace event types */ typedef enum { - TG_EV_MARKER = 0, - TG_EV_ENTER = 1, - TG_EV_EXIT = 2, - TG_EV_FORK = 3, - TG_EV_ENTER_INLINED = 4, - TG_EV_EXIT_INLINED = 5 + TG_EV_MARKER = 0, + TG_EV_ENTER_FN = 1, + TG_EV_EXIT_FN = 2, + TG_EV_ENTER_INLINED_FN = 3, + TG_EV_EXIT_INLINED_FN = 4, + TG_EV_FORK = 5, + TG_EV_THREAD_CREATE = 6 } TraceEventType; typedef struct _CommandLineOptions CommandLineOptions; @@ -746,6 +747,7 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); +void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child); void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker); void TG_(trace_close_output)(void); diff --git a/tracegrind/main.c b/tracegrind/main.c index a691178bb..fb1345fb2 100644 --- a/tracegrind/main.c +++ b/tracegrind/main.c @@ -1742,11 +1742,27 @@ static void TG_(post_syscall)(ThreadId tid, UInt syscallno, UWord* args, UInt nArgs, SysRes res) { - /* Handle fork/clone: emit FORK event with child PID */ + /* Handle fork/clone: emit FORK event with child PID. + Skip if this was a thread-creating clone (CLONE_THREAD), + since we emit THREAD_CREATE via track_pre_thread_ll_create instead. */ if (is_fork_syscall(syscallno) && !sr_isError(res) && sr_Res(res) > 0) { - /* We're in the parent, sr_Res(res) is the child PID */ - Int child_pid = (Int)sr_Res(res); - TG_(trace_emit_fork)(tid, child_pid); + Bool is_thread = False; +#if defined(VGO_linux) + if (syscallno == __NR_clone && nArgs > 0) + is_thread = (args[0] & VKI_CLONE_THREAD) != 0; +# if defined(__NR_clone3) + if (syscallno == __NR_clone3 && nArgs > 0) { + /* clone3 first arg is pointer to struct clone_args; + flags is the first field (ULong / __u64). 
*/ + ULong flags = *(ULong*)(Addr)args[0]; + is_thread = (flags & VKI_CLONE_THREAD) != 0; + } +# endif +#endif + if (!is_thread) { + Int child_pid = (Int)sr_Res(res); + TG_(trace_emit_fork)(tid, child_pid); + } } /* Handle systime collection if enabled */ @@ -2003,6 +2019,14 @@ static void tg_atfork_child(ThreadId tid) TG_(trace_reopen_child)(); } +static void tg_pre_thread_ll_create(ThreadId tid, ThreadId child) +{ + /* Skip Valgrind's internal scheduler thread (tid 0) creating the + initial client thread -- that's not a user-visible thread creation. */ + if (tid == 0) return; + TG_(trace_emit_thread_create)(tid, child); +} + static void TG_(post_clo_init)(void) { @@ -2128,6 +2152,7 @@ void TG_(pre_clo_init)(void) VG_(track_start_client_code) ( & tg_start_client_code_callback ); VG_(track_pre_deliver_signal) ( & TG_(pre_signal) ); VG_(track_post_deliver_signal)( & TG_(post_signal) ); + VG_(track_pre_thread_ll_create)( & tg_pre_thread_ll_create ); TG_(set_clo_defaults)(); diff --git a/tracegrind/scripts/tracegrind-analyzer b/tracegrind/scripts/tracegrind-analyzer index 933a3b543..93b4d249f 100755 --- a/tracegrind/scripts/tracegrind-analyzer +++ b/tracegrind/scripts/tracegrind-analyzer @@ -92,7 +92,7 @@ def decode_trace(filepath: str) -> Tuple[int, Dict[str, Any], List[List[Any]]]: def get_event_name(event_type: int) -> str: """Convert event type to name.""" - return {0: 'MARKER', 1: 'ENTER', 2: 'EXIT', 3: 'FORK', 4: 'ENTER_INLINED', 5: 'EXIT_INLINED'}.get(event_type, f'UNKNOWN({event_type})') + return {0: 'MARKER', 1: 'ENTER_FN', 2: 'EXIT_FN', 3: 'ENTER_INLINED_FN', 4: 'EXIT_INLINED_FN', 5: 'FORK', 6: 'THREAD_CREATE'}.get(event_type, f'UNKNOWN({event_type})') def format_row(row: List[Any], schema: Dict[str, Any]) -> Dict[str, Any]: @@ -166,7 +166,7 @@ def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: # Function stats (for ENTER/EXIT events) fn_counts = Counter() for row in rows: - if len(row) > 3 and row[2] in (1, 2, 4, 5): # ENTER, EXIT, ENTER_INLINED, or EXIT_INLINED + if len(row) > 3 and row[2] in (1, 2, 3, 4): # ENTER_FN, EXIT_FN, ENTER_INLINED_FN, or EXIT_INLINED_FN fn_counts[row[3]] += 1 if fn_counts: @@ -175,7 +175,7 @@ def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: print(f" {count:8,} {fn}") # FORK events - fork_rows = [row for row in rows if len(row) > 2 and row[2] == 3] + fork_rows = [row for row in rows if len(row) > 2 and row[2] == 5] if fork_rows: print(f"\nFork events: {len(fork_rows)}") for row in fork_rows[:5]: @@ -183,6 +183,15 @@ def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: child_pid = formatted.get('child_pid', 'unknown') print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_pid={child_pid}") + # THREAD_CREATE events + thread_create_rows = [row for row in rows if len(row) > 2 and row[2] == 6] + if thread_create_rows: + print(f"\nThread create events: {len(thread_create_rows)}") + for row in thread_create_rows[:5]: + formatted = format_row(row, schema) + child_tid = formatted.get('child_tid', 'unknown') + print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_tid={child_tid}") + def print_rows(rows: List[List[Any]], schema: Dict[str, Any], head: int | None = None, raw: bool = False, as_json: bool = False) -> None: @@ -227,7 +236,7 @@ def main(): help='Print raw row arrays') parser.add_argument('--json', action='store_true', help='Output as JSON') - parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER', 'EXIT', 'FORK', 'ENTER_INLINED', 'EXIT_INLINED'], + 
parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER_FN', 'EXIT_FN', 'ENTER_INLINED_FN', 'EXIT_INLINED_FN', 'FORK', 'THREAD_CREATE'], help='Filter by event type') parser.add_argument('--fn', type=str, metavar='PATTERN', help='Filter by function name (substring match)') @@ -249,7 +258,7 @@ def main(): filtered_rows = rows if args.event: - event_map = {'MARKER': 0, 'ENTER': 1, 'EXIT': 2, 'FORK': 3, 'ENTER_INLINED': 4, 'EXIT_INLINED': 5} + event_map = {'MARKER': 0, 'ENTER_FN': 1, 'EXIT_FN': 2, 'ENTER_INLINED_FN': 3, 'EXIT_INLINED_FN': 4, 'FORK': 5, 'THREAD_CREATE': 6} event_type = event_map[args.event] filtered_rows = [r for r in filtered_rows if len(r) > 2 and r[2] == event_type] diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am index d84826a8b..893d785f5 100644 --- a/tracegrind/tests/Makefile.am +++ b/tracegrind/tests/Makefile.am @@ -19,7 +19,8 @@ check_PROGRAMS = \ test_exception.bin \ test_longjmp.bin \ test_tailcall.bin \ - test_recursion.bin + test_recursion.bin \ + test_thread_create.bin AM_CPPFLAGS += -I$(top_srcdir)/tracegrind AM_CFLAGS += $(AM_FLAG_M3264_PRI) @@ -42,6 +43,8 @@ test_longjmp_bin_SOURCES = test_longjmp.c test_tailcall_bin_SOURCES = test_tailcall.c test_tailcall_bin_CFLAGS = $(AM_CFLAGS) -O2 -g test_recursion_bin_SOURCES = test_recursion.c +test_thread_create_bin_SOURCES = test_thread_create.c +test_thread_create_bin_LDADD = -lpthread EXTRA_DIST = \ test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ @@ -57,4 +60,5 @@ EXTRA_DIST = \ test_longjmp.vgtest test_longjmp.stderr.exp test_longjmp.post.exp \ test_tailcall.vgtest test_tailcall.stderr.exp test_tailcall.post.exp \ test_recursion.vgtest test_recursion.stderr.exp test_recursion.post.exp \ + test_thread_create.vgtest test_thread_create.stderr.exp test_thread_create.post.exp \ test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/filter_trace b/tracegrind/tests/filter_trace index 6e2065baf..28b92e918 100755 --- a/tracegrind/tests/filter_trace +++ b/tracegrind/tests/filter_trace @@ -44,6 +44,9 @@ sed '/^Top 10 functions/,/^$/d' | # Remove "Fork events" section (platform-dependent) sed '/^Fork events/,/^$/d' | +# Remove "Thread create events" section (platform-dependent) +sed '/^Thread create events/,/^$/d' | + # Normalize seq numbers in raw arrays: [1234, ...] -> [N, ...] 
sed 's/^\[\([0-9]\+\),/[N,/g' | diff --git a/tracegrind/tests/test_basic.post.exp b/tracegrind/tests/test_basic.post.exp index d6fbcf234..c1c546e90 100644 --- a/tracegrind/tests/test_basic.post.exp +++ b/tracegrind/tests/test_basic.post.exp @@ -4,17 +4,18 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Total rows: N Events by type: - ENTER: N (P%) - EXIT: N (P%) + ENTER_FN: N (P%) + EXIT_FN: N (P%) Threads: 1 Sequence range: N - N diff --git a/tracegrind/tests/test_enter_inlined.post.exp b/tracegrind/tests/test_enter_inlined.post.exp index 4cbc80bd4..2af0e323f 100644 --- a/tracegrind/tests/test_enter_inlined.post.exp +++ b/tracegrind/tests/test_enter_inlined.post.exp @@ -4,16 +4,17 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER_INLINED | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N -seq=N | tid=1 | event=EXIT_INLINED | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N -seq=N | tid=1 | event=EXIT | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git 
a/tracegrind/tests/test_exception.post.exp b/tracegrind/tests/test_exception.post.exp index f1a06baf7..7f57de18e 100644 --- a/tracegrind/tests/test_exception.post.exp +++ b/tracegrind/tests/test_exception.post.exp @@ -5,10 +5,10 @@ Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_foo_bar_baz.post.exp b/tracegrind/tests/test_foo_bar_baz.post.exp index 9f89daf35..5f71183f9 100644 --- a/tracegrind/tests/test_foo_bar_baz.post.exp +++ b/tracegrind/tests/test_foo_bar_baz.post.exp @@ -4,22 +4,23 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER 
| fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_inline.post.exp b/tracegrind/tests/test_inline.post.exp index 276e3d3e3..7156df4a3 100644 --- a/tracegrind/tests/test_inline.post.exp +++ b/tracegrind/tests/test_inline.post.exp @@ -4,14 +4,15 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_instr_toggle.post.exp b/tracegrind/tests/test_instr_toggle.post.exp index 4a2fa95a9..5c6df34b4 100644 --- a/tracegrind/tests/test_instr_toggle.post.exp +++ b/tracegrind/tests/test_instr_toggle.post.exp @@ -4,18 +4,19 @@ 
Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=before-fibo -seq=N | tid=1 | event=ENTER | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=after-fibo diff --git a/tracegrind/tests/test_longjmp.post.exp b/tracegrind/tests/test_longjmp.post.exp index 2577e4828..47ff2d6de 100644 --- a/tracegrind/tests/test_longjmp.post.exp +++ b/tracegrind/tests/test_longjmp.post.exp @@ -5,10 +5,10 @@ Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | 
fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_marker.post.exp b/tracegrind/tests/test_marker.post.exp index 7158215da..f2d89fdd8 100644 --- a/tracegrind/tests/test_marker.post.exp +++ b/tracegrind/tests/test_marker.post.exp @@ -4,11 +4,12 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Showing N of N rows [N, 1, 0, 'start-work'] diff --git a/tracegrind/tests/test_nested_inlined.post.exp b/tracegrind/tests/test_nested_inlined.post.exp index d69724cbc..ca8fc06cc 100644 --- a/tracegrind/tests/test_nested_inlined.post.exp +++ b/tracegrind/tests/test_nested_inlined.post.exp @@ -4,18 +4,19 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=26 | Ir=N -seq=N | tid=1 | event=ENTER_INLINED | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N -seq=N | tid=1 | event=EXIT_INLINED | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N -seq=N | tid=1 | event=EXIT_INLINED | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N -seq=N | tid=1 | event=EXIT | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | 
fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=26 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_recursion.post.exp b/tracegrind/tests/test_recursion.post.exp index c92dd1288..06977039b 100644 --- a/tracegrind/tests/test_recursion.post.exp +++ b/tracegrind/tests/test_recursion.post.exp @@ -1,4 +1,4 @@ seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=MARKER | marker=end -ENTER count: 101 -EXIT count: 101 +ENTER_FN count: 101 +EXIT_FN count: 101 diff --git a/tracegrind/tests/test_recursion.vgtest b/tracegrind/tests/test_recursion.vgtest index fcb7fb43e..a675bd483 100644 --- a/tracegrind/tests/test_recursion.vgtest +++ b/tracegrind/tests/test_recursion.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_recursion.bin vgopts: --tracegrind-out-file=tracegrind.out.test_recursion --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_recursion.msgpack.lz4 | ./filter_trace | grep -E '(event=MARKER|fn=recurse)' | awk '/MARKER/{print} /ENTER/{e++} /EXIT/{x++} END{print "ENTER count: "e; print "EXIT count: "x}' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_recursion.msgpack.lz4 | ./filter_trace | grep -E '(event=MARKER|fn=recurse)' | awk '/MARKER/{print} /ENTER_FN/{e++} /EXIT_FN/{x++} END{print "ENTER_FN count: "e; print "EXIT_FN count: "x}' cleanup: rm -f tracegrind.out.test_recursion.msgpack.lz4 diff --git a/tracegrind/tests/test_schema.post.exp b/tracegrind/tests/test_schema.post.exp index 4b2b56d7b..e9dfbb564 100644 --- a/tracegrind/tests/test_schema.post.exp +++ b/tracegrind/tests/test_schema.post.exp @@ -4,9 +4,10 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] diff --git a/tracegrind/tests/test_signal.post.exp b/tracegrind/tests/test_signal.post.exp index dfb11ddde..e6a840785 100644 --- a/tracegrind/tests/test_signal.post.exp +++ b/tracegrind/tests/test_signal.post.exp @@ -5,6 +5,6 @@ Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 
'event', 'marker'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_tailcall.post.exp b/tracegrind/tests/test_tailcall.post.exp index 8cb6e349f..2cd177ff5 100644 --- a/tracegrind/tests/test_tailcall.post.exp +++ b/tracegrind/tests/test_tailcall.post.exp @@ -5,10 +5,10 @@ Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start -seq=N | tid=1 | event=ENTER | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N -seq=N | tid=1 | event=ENTER | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N -seq=N | tid=1 | event=EXIT | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_thread_create.c b/tracegrind/tests/test_thread_create.c new file mode 100644 index 000000000..f04b0b167 --- /dev/null +++ b/tracegrind/tests/test_thread_create.c @@ -0,0 +1,18 @@ +#include "tracegrind.h" +#include + +static void *thread_fn(void *arg) { + (void)arg; + return NULL; +} + +int main(void) { + pthread_t t; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + pthread_create(&t, NULL, thread_fn, NULL); + pthread_join(t, NULL); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return 0; +} diff --git a/tracegrind/tests/test_thread_create.post.exp b/tracegrind/tests/test_thread_create.post.exp new file mode 100644 index 000000000..c10af4718 --- /dev/null +++ b/tracegrind/tests/test_thread_create.post.exp @@ -0,0 +1,17 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 
'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 +seq=N | tid=2 | event=ENTER_FN | fn=thread_fn | obj=test_thread_create.bin | file=??? | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=thread_fn | obj=test_thread_create.bin | file=??? | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_thread_create.stderr.exp b/tracegrind/tests/test_thread_create.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_thread_create.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_thread_create.vgtest b/tracegrind/tests/test_thread_create.vgtest new file mode 100644 index 000000000..1c47b1e23 --- /dev/null +++ b/tracegrind/tests/test_thread_create.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_thread_create.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_create --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_create.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|MARKER|THREAD_CREATE|FORK|thread_fn|Showing |\()' | sed 's/child_pid=[0-9]*/child_pid=N/' +cleanup: rm -f tracegrind.out.test_thread_create.msgpack.lz4 diff --git a/tracegrind/tests/test_toggle_collect.post.exp b/tracegrind/tests/test_toggle_collect.post.exp index d6fbcf234..c1c546e90 100644 --- a/tracegrind/tests/test_toggle_collect.post.exp +++ b/tracegrind/tests/test_toggle_collect.post.exp @@ -4,17 +4,18 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (FORK): ['seq', 'tid', 'event', 'child_pid'] - 4 (ENTER_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 5 (EXIT_INLINED): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] Total rows: N Events by type: - ENTER: N (P%) - EXIT: N (P%) + ENTER_FN: N (P%) + EXIT_FN: N (P%) Threads: 1 Sequence range: N - N From c1f86fabdf73bb3e349a225fe5c3d0640edbf245 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 03:11:21 +0000 Subject: [PATCH 15/26] test(tracegrind): add syscall regression test with collect-systime=nsec Verify that syscall instruction counts and timing (sysCount, sysTime, sysCpuTime) are properly attributed to libc wrapper functions (getpid, write) when --collect-systime=nsec is enabled. Nonzero timing values on EXIT_FN events are normalized to T to assert measurement occurred. 
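For illustration, the invariant this test encodes can be phrased as a small check over the analyzer's decoded rows. This is only a sketch: the `rows` input and the fixed column positions ([seq, tid, event, fn, obj, file, line, Ir, sysCount, sysTime, sysCpuTime]) are assumptions mirroring the EXIT_FN schema shown below, not code added by this patch.

```python
# Hypothetical check over decoded trace rows; assumes the EXIT_FN layout
# [seq, tid, event, fn, obj, file, line, Ir, sysCount, sysTime, sysCpuTime].
EXIT_FN = 2  # event discriminator used by this patch series

def syscall_cost_attributed(rows, wrappers=("getpid", "write")):
    """True if every EXIT_FN row for a wrapper carries at least one syscall
    and nonzero wall/CPU time, and each wrapper was seen at least once."""
    seen = set()
    for row in rows:
        if len(row) >= 11 and row[2] == EXIT_FN and row[3] in wrappers:
            seen.add(row[3])
            sys_count, sys_time, sys_cpu = row[8], row[9], row[10]
            if sys_count < 1 or sys_time <= 0 or sys_cpu <= 0:
                return False
    return seen == set(wrappers)
```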
Co-Authored-By: Claude Opus 4.6 --- tracegrind/tests/Makefile.am | 5 ++++- tracegrind/tests/filter_trace | 5 +++++ tracegrind/tests/test_syscall.c | 28 ++++++++++++++++++++++++ tracegrind/tests/test_syscall.post.exp | 24 ++++++++++++++++++++ tracegrind/tests/test_syscall.stderr.exp | 6 +++++ tracegrind/tests/test_syscall.vgtest | 5 +++++ 6 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 tracegrind/tests/test_syscall.c create mode 100644 tracegrind/tests/test_syscall.post.exp create mode 100644 tracegrind/tests/test_syscall.stderr.exp create mode 100644 tracegrind/tests/test_syscall.vgtest diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am index 893d785f5..e94b7d81e 100644 --- a/tracegrind/tests/Makefile.am +++ b/tracegrind/tests/Makefile.am @@ -20,7 +20,8 @@ check_PROGRAMS = \ test_longjmp.bin \ test_tailcall.bin \ test_recursion.bin \ - test_thread_create.bin + test_thread_create.bin \ + test_syscall.bin AM_CPPFLAGS += -I$(top_srcdir)/tracegrind AM_CFLAGS += $(AM_FLAG_M3264_PRI) @@ -45,6 +46,7 @@ test_tailcall_bin_CFLAGS = $(AM_CFLAGS) -O2 -g test_recursion_bin_SOURCES = test_recursion.c test_thread_create_bin_SOURCES = test_thread_create.c test_thread_create_bin_LDADD = -lpthread +test_syscall_bin_SOURCES = test_syscall.c EXTRA_DIST = \ test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ @@ -61,4 +63,5 @@ EXTRA_DIST = \ test_tailcall.vgtest test_tailcall.stderr.exp test_tailcall.post.exp \ test_recursion.vgtest test_recursion.stderr.exp test_recursion.post.exp \ test_thread_create.vgtest test_thread_create.stderr.exp test_thread_create.post.exp \ + test_syscall.vgtest test_syscall.stderr.exp test_syscall.post.exp \ test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/filter_trace b/tracegrind/tests/filter_trace index 28b92e918..1ccbbf46a 100755 --- a/tracegrind/tests/filter_trace +++ b/tracegrind/tests/filter_trace @@ -20,6 +20,11 @@ sed 's|file=[^ |]*[/]||g' | # Replace Ir= with Ir=N sed 's|Ir=[0-9]\+|Ir=N|g' | +# Normalize syscall timing values (non-deterministic) +# Replace nonzero sysTime/sysCpuTime with >0 to assert they are measured +sed 's|sysTime=[1-9][0-9]*|sysTime=T|g' | +sed 's|sysCpuTime=[1-9][0-9]*|sysCpuTime=T|g' | + # Remove the separator line sed '/^-\{10,\}$/d' | diff --git a/tracegrind/tests/test_syscall.c b/tracegrind/tests/test_syscall.c new file mode 100644 index 000000000..a3001f17b --- /dev/null +++ b/tracegrind/tests/test_syscall.c @@ -0,0 +1,28 @@ +#include "tracegrind.h" +#include +#include + +static int __attribute__((noinline)) do_getpid(void) { + return getpid(); +} + +static void __attribute__((noinline)) do_write(int fd) { + const char msg[] = "hello\n"; + write(fd, msg, sizeof(msg) - 1); +} + +static void __attribute__((noinline)) caller(int fd) { + do_getpid(); + do_write(fd); +} + +int main(void) { + int fd = open("/dev/null", O_WRONLY); + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + caller(fd); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + close(fd); + return 0; +} diff --git a/tracegrind/tests/test_syscall.post.exp b/tracegrind/tests/test_syscall.post.exp new file mode 100644 index 000000000..5bb4734e3 --- /dev/null +++ b/tracegrind/tests/test_syscall.post.exp @@ -0,0 +1,24 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 
'sysTime', 'sysCpuTime'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=do_getpid | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=getpid | obj=libc.so.6 | file=syscall-template.S | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=getpid | obj=libc.so.6 | file=syscall-template.S | line=0 | Ir=N | sysCount=1 | sysTime=T | sysCpuTime=T +seq=N | tid=1 | event=EXIT_FN | fn=do_getpid | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=do_write | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=write | obj=libc.so.6 | file=write.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=write | obj=libc.so.6 | file=write.c | line=0 | Ir=N | sysCount=1 | sysTime=T | sysCpuTime=T +seq=N | tid=1 | event=EXIT_FN | fn=do_write | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_syscall.stderr.exp b/tracegrind/tests/test_syscall.stderr.exp new file mode 100644 index 000000000..838c3d735 --- /dev/null +++ b/tracegrind/tests/test_syscall.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir sysCount sysTime sysCpuTime +Collected : + +I refs: diff --git a/tracegrind/tests/test_syscall.vgtest b/tracegrind/tests/test_syscall.vgtest new file mode 100644 index 000000000..e60251069 --- /dev/null +++ b/tracegrind/tests/test_syscall.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_syscall.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_syscall --instr-atstart=no --collect-systime=nsec +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' +cleanup: rm -f tracegrind.out.test_syscall.msgpack.lz4 From ef5bcde5ba07e2aa5e90d492cfda4608ba962836 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 14:11:56 +0000 Subject: [PATCH 16/26] feat(tracegrind): add creator and version metadata to output header Include creator ("valgrind-tracegrind") and creator_version fields in the schema chunk so consumers can identify which tool and version produced the trace file. 
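As a usage illustration, a consumer that has already decoded the schema chunk into a dict could pick up the new fields as sketched here; the `header` argument and the fallback defaults are assumptions, and only the `creator` and `creator_version` keys come from this patch.

```python
# Hypothetical consumer-side helper; `header` is assumed to be the MsgPack
# map decoded from the first (schema) chunk of a trace file.
def describe_producer(header: dict) -> str:
    creator = header.get("creator", "unknown")
    version = header.get("creator_version", "unknown")
    if creator != "valgrind-tracegrind":
        raise ValueError(f"unexpected trace producer: {creator!r}")
    return f"{creator} {version}"
```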
Co-Authored-By: Claude Opus 4.6 --- tracegrind/docs/tracegrind-msgpack-format.md | 2 ++ tracegrind/dump.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index f25975e2e..1cfd94da0 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -43,6 +43,8 @@ The first chunk contains a MsgPack map describing the discriminated union schema { "version": 3, "format": "tracegrind-msgpack", + "creator": "valgrind-tracegrind", + "creator_version": "3.26.0.codspeed", "event_schemas": { "0": ["seq", "tid", "event", "marker"], "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], diff --git a/tracegrind/dump.c b/tracegrind/dump.c index 95697aa9f..da03d1c1f 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -112,7 +112,7 @@ static void msgpack_write_header(void) msgpack_init(&hdr, 2048); /* Header is a map with metadata */ - msgpack_write_map_header(&hdr, 3); + msgpack_write_map_header(&hdr, 5); /* version */ msgpack_write_key(&hdr, "version"); @@ -122,6 +122,14 @@ static void msgpack_write_header(void) msgpack_write_key(&hdr, "format"); msgpack_write_str(&hdr, "tracegrind-msgpack", -1); + /* creator */ + msgpack_write_key(&hdr, "creator"); + msgpack_write_str(&hdr, "valgrind-tracegrind", -1); + + /* creator_version */ + msgpack_write_key(&hdr, "creator_version"); + msgpack_write_str(&hdr, VERSION, -1); + /* event_schemas - discriminated union: each event type has its own schema */ msgpack_write_key(&hdr, "event_schemas"); msgpack_write_map_header(&hdr, 7); /* 7 event types */ From 974acaef7f3ca9aeb25b8c513bec10a99800680b Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 14:55:21 +0000 Subject: [PATCH 17/26] test(tracegrind): add multi-thread interleaved callstack regression test Spawns 3 threads with distinct noinline call chains at different depths (work_a->depth_a1->depth_a2, work_b->depth_b1, work_c->depth_c1->depth_c2) to verify tracegrind correctly tracks per-thread ENTER_FN/EXIT_FN stacks. Output is sorted by tid for deterministic comparison. 
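The property being exercised here, properly nested and balanced ENTER_FN/EXIT_FN pairs tracked independently per thread, can be stated as a short check over decoded rows. The following is a sketch under the assumption that rows look like [seq, tid, event, fn, ...] with event codes 1/2 as in this series; none of it is part of the patch itself.

```python
# Hypothetical per-thread nesting check over decoded trace rows.
from collections import defaultdict

ENTER_FN, EXIT_FN = 1, 2

def stacks_are_balanced(rows) -> bool:
    stacks = defaultdict(list)              # tid -> stack of function names
    for row in rows:
        if len(row) < 4:
            continue
        tid, event, fn = row[1], row[2], row[3]
        if event == ENTER_FN:
            stacks[tid].append(fn)
        elif event == EXIT_FN:
            if not stacks[tid] or stacks[tid].pop() != fn:
                return False                # exit without a matching enter
    return all(not s for s in stacks.values())
```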
Co-Authored-By: Claude Opus 4.6 --- tracegrind/tests/Makefile.am | 4 ++ tracegrind/tests/test_thread_interleave.c | 54 +++++++++++++++++++ .../tests/test_thread_interleave.post.exp | 33 ++++++++++++ .../tests/test_thread_interleave.stderr.exp | 6 +++ .../tests/test_thread_interleave.vgtest | 5 ++ 5 files changed, 102 insertions(+) create mode 100644 tracegrind/tests/test_thread_interleave.c create mode 100644 tracegrind/tests/test_thread_interleave.post.exp create mode 100644 tracegrind/tests/test_thread_interleave.stderr.exp create mode 100644 tracegrind/tests/test_thread_interleave.vgtest diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am index e94b7d81e..0ad3b6ae5 100644 --- a/tracegrind/tests/Makefile.am +++ b/tracegrind/tests/Makefile.am @@ -21,6 +21,7 @@ check_PROGRAMS = \ test_tailcall.bin \ test_recursion.bin \ test_thread_create.bin \ + test_thread_interleave.bin \ test_syscall.bin AM_CPPFLAGS += -I$(top_srcdir)/tracegrind @@ -46,6 +47,8 @@ test_tailcall_bin_CFLAGS = $(AM_CFLAGS) -O2 -g test_recursion_bin_SOURCES = test_recursion.c test_thread_create_bin_SOURCES = test_thread_create.c test_thread_create_bin_LDADD = -lpthread +test_thread_interleave_bin_SOURCES = test_thread_interleave.c +test_thread_interleave_bin_LDADD = -lpthread test_syscall_bin_SOURCES = test_syscall.c EXTRA_DIST = \ @@ -63,5 +66,6 @@ EXTRA_DIST = \ test_tailcall.vgtest test_tailcall.stderr.exp test_tailcall.post.exp \ test_recursion.vgtest test_recursion.stderr.exp test_recursion.post.exp \ test_thread_create.vgtest test_thread_create.stderr.exp test_thread_create.post.exp \ + test_thread_interleave.vgtest test_thread_interleave.stderr.exp test_thread_interleave.post.exp \ test_syscall.vgtest test_syscall.stderr.exp test_syscall.post.exp \ test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/test_thread_interleave.c b/tracegrind/tests/test_thread_interleave.c new file mode 100644 index 000000000..94ef46f05 --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.c @@ -0,0 +1,54 @@ +#include "tracegrind.h" +#include + +__attribute__((noinline)) static void depth_a2(void) { } + +__attribute__((noinline)) static void depth_a1(void) { + depth_a2(); +} + +__attribute__((noinline)) static void *work_a(void *arg) { + (void)arg; + depth_a1(); + return NULL; +} + +__attribute__((noinline)) static void depth_b1(void) { } + +__attribute__((noinline)) static void *work_b(void *arg) { + (void)arg; + depth_b1(); + return NULL; +} + +__attribute__((noinline)) static void depth_c2(void) { } + +__attribute__((noinline)) static void depth_c1(void) { + depth_c2(); +} + +__attribute__((noinline)) static void *work_c(void *arg) { + (void)arg; + depth_c1(); + return NULL; +} + +int main(void) { + pthread_t t1, t2, t3; + + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + + pthread_create(&t1, NULL, work_a, NULL); + pthread_create(&t2, NULL, work_b, NULL); + pthread_create(&t3, NULL, work_c, NULL); + + pthread_join(t1, NULL); + pthread_join(t2, NULL); + pthread_join(t3, NULL); + + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return 0; +} diff --git a/tracegrind/tests/test_thread_interleave.post.exp b/tracegrind/tests/test_thread_interleave.post.exp new file mode 100644 index 000000000..906e77b66 --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.post.exp @@ -0,0 +1,33 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 
'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 +seq=N | tid=1 | event=THREAD_CREATE | child_tid=3 +seq=N | tid=1 | event=THREAD_CREATE | child_tid=4 +seq=N | tid=1 | event=MARKER | marker=end +seq=N | tid=2 | event=ENTER_FN | fn=work_a | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=depth_a1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=depth_a2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=depth_a2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=depth_a1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=work_a | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=ENTER_FN | fn=work_b | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=ENTER_FN | fn=depth_b1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=EXIT_FN | fn=depth_b1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=EXIT_FN | fn=work_b | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=work_c | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=depth_c1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=depth_c2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=depth_c2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=depth_c1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=work_c | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N diff --git a/tracegrind/tests/test_thread_interleave.stderr.exp b/tracegrind/tests/test_thread_interleave.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_thread_interleave.vgtest b/tracegrind/tests/test_thread_interleave.vgtest new file mode 100644 index 000000000..76a5ac075 --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_thread_interleave.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_interleave --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_interleave.msgpack.lz4 | ./filter_trace | 
grep -E '(Format |Schema |Event Schemas|Showing |\(|MARKER|THREAD_CREATE|fn=work_a |fn=work_b |fn=work_c |fn=depth_a1 |fn=depth_a2 |fn=depth_b1 |fn=depth_c1 |fn=depth_c2 )' | sort -t'|' -k2,2 -s +cleanup: rm -f tracegrind.out.test_thread_interleave.msgpack.lz4 From 2a7fa278e5d39f6139cedfcc7728ddff280b07da Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 15:04:52 +0000 Subject: [PATCH 18/26] fix(tracegrind): update test_thread_create expected output for resolved file path The expected output had file=??? but debug info now correctly resolves the source file to test_thread_create.c. Co-Authored-By: Claude Opus 4.6 --- tracegrind/tests/test_thread_create.post.exp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tracegrind/tests/test_thread_create.post.exp b/tracegrind/tests/test_thread_create.post.exp index c10af4718..91eef3fe8 100644 --- a/tracegrind/tests/test_thread_create.post.exp +++ b/tracegrind/tests/test_thread_create.post.exp @@ -12,6 +12,6 @@ Event Schemas (discriminated union): Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 -seq=N | tid=2 | event=ENTER_FN | fn=thread_fn | obj=test_thread_create.bin | file=??? | line=0 | Ir=N -seq=N | tid=2 | event=EXIT_FN | fn=thread_fn | obj=test_thread_create.bin | file=??? | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=thread_fn | obj=test_thread_create.bin | file=test_thread_create.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=thread_fn | obj=test_thread_create.bin | file=test_thread_create.c | line=0 | Ir=N seq=N | tid=1 | event=MARKER | marker=end From b6449a338dba63a6e8211ecd6b121f8b89c1ba9f Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 15:52:08 +0000 Subject: [PATCH 19/26] test: improved benchmarks --- bench/bench.py | 57 +++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/bench/bench.py b/bench/bench.py index 06ff6b66f..d577a67ec 100755 --- a/bench/bench.py +++ b/bench/bench.py @@ -92,6 +92,13 @@ def runner(request): return request.config._valgrind_runner +CACHE_SIM_OPTIONS = [ + "--cache-sim=yes", + "--I1=32768,8,64", + "--D1=32768,8,64", + "--LL=8388608,16,64", +] + def pytest_generate_tests(metafunc): """Parametrize tests with valgrind configurations.""" if "tool_and_args" in metafunc.fixturenames: @@ -103,61 +110,56 @@ def pytest_generate_tests(metafunc): # Format: (tool, args, config_name) all_configs = [ # Callgrind configurations - ("callgrind", ["--read-inline-info=no"], "callgrind/no-inline"), - ("callgrind", ["--read-inline-info=yes"], "callgrind/inline"), + ("callgrind", ["--read-inline-info=no"], "cg/no-inline"), + ("callgrind", ["--read-inline-info=yes"], "cg/inline"), ( "callgrind", [ + *CACHE_SIM_OPTIONS, "--trace-children=yes", - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", "--collect-systime=nsec", "--compress-strings=no", "--combine-dumps=yes", "--dump-line=no", "--read-inline-info=yes", ], - "callgrind/full-with-inline", + "cg/full-inline", ), ( "callgrind", [ + *CACHE_SIM_OPTIONS, "--trace-children=yes", - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", "--collect-systime=nsec", "--compress-strings=no", "--combine-dumps=yes", "--dump-line=no", + "--read-inline-info=no", ], - "callgrind/full-no-inline", + "cg/full-no-inline", ), # Tracegrind configurations (only available in codspeed fork) - ("tracegrind", [], "tracegrind/default"), + 
("tracegrind", ["--read-inline-info=no"], "tg/no-inline"), + ("tracegrind", ["--read-inline-info=yes"], "tg/inline"), ( "tracegrind", [ - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", + *CACHE_SIM_OPTIONS, + "--trace-children=yes", + "--collect-systime=nsec", + "--read-inline-info=no", ], - "tracegrind/cache-sim", + "tg/full-no-inline", ), ( "tracegrind", [ - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", + *CACHE_SIM_OPTIONS, + "--trace-children=yes", "--collect-systime=nsec", + "--read-inline-info=yes", ], - "tracegrind/cache-sim+systime", + "tg/full-inline", ), ] @@ -174,11 +176,14 @@ def pytest_generate_tests(metafunc): # If the valgrind version is from CodSpeed, we don't want to display the exact version # to allow comparison against older versions. if ".codspeed" in runner.valgrind_version: - runner.valgrind_version = "valgrind.codspeed" + runner.valgrind_version = "codspeed" + # Clean valgrind version names + else: + runner.valgrind_version.removeprefix("valgrind-") # Create test IDs with format: valgrind-version, command, config-name test_ids = [ - f"{runner.valgrind_version}, {runner.cmd}, {config_name}" + f"{runner.valgrind_version}/{config_name}, {runner.cmd}" for _, _, config_name in configs ] @@ -243,7 +248,7 @@ def pytest_configure(self, config): config._valgrind_runner = runner exit_code = pytest.main( - [__file__, "-v", "--codspeed", "--codspeed-warmup-time=0", "--codspeed-max-time=5"], + [__file__, "-v", "--codspeed", "--codspeed-warmup-time=0", "--codspeed-max-time=30"], plugins=[RunnerPlugin()], ) if exit_code != 0 and exit_code != 5: From c036d1fc3554e8a3356b9e8afd48748198762885 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 15:58:13 +0000 Subject: [PATCH 20/26] feat(tracegrind): add counter_units map to output header Record the unit of time-based event counters (sysTime, sysCpuTime) in the schema chunk so consumers can interpret values without out-of-band knowledge of the --collect-systime setting. The map is extensible for future counters. Co-Authored-By: Claude Opus 4.6 --- tracegrind/docs/tracegrind-msgpack-format.md | 16 +++++++++++++ tracegrind/dump.c | 25 +++++++++++++++++++- tracegrind/scripts/tracegrind-analyzer | 3 +++ tracegrind/tests/test_syscall.post.exp | 1 + tracegrind/tests/test_syscall.vgtest | 2 +- 5 files changed, 45 insertions(+), 2 deletions(-) diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index 1cfd94da0..ca3a4be09 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -53,6 +53,10 @@ The first chunk contains a MsgPack map describing the discriminated union schema "4": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], "5": ["seq", "tid", "event", "child_pid"], "6": ["seq", "tid", "event", "child_tid"] + }, + "counter_units": { + "sysTime": "ns", + "sysCpuTime": "ns" } } ``` @@ -132,6 +136,18 @@ For ENTER_FN/EXIT_FN/ENTER_INLINED_FN/EXIT_INLINED_FN rows, event counters appea `Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim` +### Counter Units + +The `counter_units` field is a map from event counter name to its unit string. Only time-based counters are listed; counters absent from the map are dimensionless. 
+ +| `--collect-systime` | Entries in `counter_units` | +|---------------------|--------------------| +| `msec` | `"sysTime": "ms"` | +| `usec` | `"sysTime": "us"` | +| `nsec` | `"sysTime": "ns"`, `"sysCpuTime": "ns"` | + +When `--collect-systime` is not set, the `counter_units` map is empty. + ## Data Chunks Each data chunk contains concatenated MsgPack arrays. The row format depends on the event type (index 2): diff --git a/tracegrind/dump.c b/tracegrind/dump.c index da03d1c1f..f8313f938 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -112,7 +112,7 @@ static void msgpack_write_header(void) msgpack_init(&hdr, 2048); /* Header is a map with metadata */ - msgpack_write_map_header(&hdr, 5); + msgpack_write_map_header(&hdr, 6); /* version */ msgpack_write_key(&hdr, "version"); @@ -186,6 +186,29 @@ static void msgpack_write_header(void) msgpack_write_str(&hdr, "event", -1); msgpack_write_str(&hdr, "child_tid", -1); + /* counter_units - map from counter name to unit string. + Following callgrind's convention: only time counters get units. */ + msgpack_write_key(&hdr, "counter_units"); + { + Int n_units = 0; + const HChar* unit_str = NULL; + switch (TG_(clo).collect_systime) { + case systime_no: break; + case systime_msec: unit_str = "ms"; n_units = 1; break; + case systime_usec: unit_str = "us"; n_units = 1; break; + case systime_nsec: unit_str = "ns"; n_units = 2; break; + } + msgpack_write_map_header(&hdr, n_units); + if (unit_str) { + msgpack_write_key(&hdr, "sysTime"); + msgpack_write_str(&hdr, unit_str, -1); + if (TG_(clo).collect_systime == systime_nsec) { + msgpack_write_key(&hdr, "sysCpuTime"); + msgpack_write_str(&hdr, unit_str, -1); + } + } + } + /* Compress and write header chunk */ SizeT src_size = hdr.size; SizeT dst_capacity = tg_lz4_compress_bound(src_size); diff --git a/tracegrind/scripts/tracegrind-analyzer b/tracegrind/scripts/tracegrind-analyzer index 93b4d249f..67ebbba8d 100755 --- a/tracegrind/scripts/tracegrind-analyzer +++ b/tracegrind/scripts/tracegrind-analyzer @@ -136,6 +136,9 @@ def print_schema(schema: Dict[str, Any], version: int) -> None: print(f" {event_type} ({event_name}): {columns}") elif 'columns' in schema: print(f"Columns: {schema['columns']}") + + if schema.get('counter_units'): + print(f"Counter Units: {dict(sorted(schema['counter_units'].items()))}") print() diff --git a/tracegrind/tests/test_syscall.post.exp b/tracegrind/tests/test_syscall.post.exp index 5bb4734e3..b4ff2479f 100644 --- a/tracegrind/tests/test_syscall.post.exp +++ b/tracegrind/tests/test_syscall.post.exp @@ -9,6 +9,7 @@ Event Schemas (discriminated union): 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counter Units: {'sysCpuTime': 'ns', 'sysTime': 'ns'} Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 diff --git a/tracegrind/tests/test_syscall.vgtest b/tracegrind/tests/test_syscall.vgtest index e60251069..d67cf88d7 100644 --- a/tracegrind/tests/test_syscall.vgtest +++ b/tracegrind/tests/test_syscall.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_syscall.bin vgopts: --tracegrind-out-file=tracegrind.out.test_syscall --instr-atstart=no --collect-systime=nsec -post: 
../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counter Units:|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' cleanup: rm -f tracegrind.out.test_syscall.msgpack.lz4 From 515925bef1b24054ce710fddcfa12d27a6a12d54 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 16:18:26 +0000 Subject: [PATCH 21/26] ci: refine benchmark timeout --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a4ae6db7..97e933807 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,7 @@ on: jobs: test: + timeout-minutes: 30 strategy: matrix: runner: From 91d6e858e399f09b6c1c471fe15281cc07d1ba0c Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sat, 7 Feb 2026 17:05:48 +0000 Subject: [PATCH 22/26] fix(tracegrind): stop auto-appending .msgpack.lz4 extension to output file The output file path is now used exactly as specified by --tracegrind-out-file. The default format includes the extension so the default behavior is unchanged. Co-Authored-By: Claude Opus 4.6 --- tracegrind/clo.c | 2 +- tracegrind/docs/tracegrind-msgpack-format.md | 2 +- tracegrind/dump.c | 7 ------- tracegrind/global.h | 2 +- tracegrind/tests/test_basic.vgtest | 2 +- tracegrind/tests/test_enter_inlined.vgtest | 2 +- tracegrind/tests/test_exception.vgtest | 2 +- tracegrind/tests/test_foo_bar_baz.vgtest | 2 +- tracegrind/tests/test_inline.vgtest | 2 +- tracegrind/tests/test_instr_toggle.vgtest | 2 +- tracegrind/tests/test_longjmp.vgtest | 2 +- tracegrind/tests/test_marker.vgtest | 2 +- tracegrind/tests/test_nested_inlined.vgtest | 2 +- tracegrind/tests/test_recursion.vgtest | 2 +- tracegrind/tests/test_schema.vgtest | 2 +- tracegrind/tests/test_signal.vgtest | 2 +- tracegrind/tests/test_syscall.vgtest | 2 +- tracegrind/tests/test_tailcall.vgtest | 2 +- tracegrind/tests/test_thread_create.vgtest | 2 +- tracegrind/tests/test_thread_interleave.vgtest | 2 +- tracegrind/tests/test_toggle_collect.vgtest | 2 +- 21 files changed, 20 insertions(+), 27 deletions(-) diff --git a/tracegrind/clo.c b/tracegrind/clo.c index 3bd96697e..9662016fb 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -505,7 +505,7 @@ void TG_(print_usage)(void) { VG_(printf)( "\n output options:\n" -" --tracegrind-out-file= Output file name [tracegrind.out.%%p]\n" +" --tracegrind-out-file= Output file name [tracegrind.out.%%p.msgpack.lz4]\n" "\n data collection options:\n" " --instr-atstart=no|yes Do instrumentation at tracegrind start [yes]\n" diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index ca3a4be09..6a7f3466b 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -2,7 +2,7 @@ ## Overview -Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 block compression. Files use the `.msgpack.lz4` extension. +Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 block compression. The default output file name is `tracegrind.out..msgpack.lz4`. 
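+
+For example, a consumer locating the default output of a traced process could
+build the path from the PID (a minimal sketch; `default_trace_path` is a
+hypothetical helper, assuming `%p` in the default `--tracegrind-out-file`
+format expands to the process ID):
+
+```python
+import os
+
+def default_trace_path(pid: int, directory: str = ".") -> str:
+    # Default --tracegrind-out-file format is "tracegrind.out.%p.msgpack.lz4",
+    # where %p is replaced by the PID of the traced process.
+    return os.path.join(directory, f"tracegrind.out.{pid}.msgpack.lz4")
+```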
## File Structure diff --git a/tracegrind/dump.c b/tracegrind/dump.c index f8313f938..ba02dd64b 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -406,13 +406,6 @@ void TG_(trace_open_output)(void) filename[sizeof(filename) - 1] = '\0'; VG_(free)(expanded); - /* Append .msgpack.lz4 suffix */ - SizeT len = VG_(strlen)(filename); - if (len + 12 < sizeof(filename)) { - VG_(strncpy)(filename + len, ".msgpack.lz4", sizeof(filename) - len - 1); - filename[sizeof(filename) - 1] = '\0'; - } - res = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY|VKI_O_TRUNC, VKI_S_IRUSR|VKI_S_IWUSR); diff --git a/tracegrind/global.h b/tracegrind/global.h index f0312ed6b..86b883a89 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -67,7 +67,7 @@ /*--- Command line options ---*/ /*------------------------------------------------------------*/ -#define DEFAULT_OUTFORMAT "tracegrind.out.%p" +#define DEFAULT_OUTFORMAT "tracegrind.out.%p.msgpack.lz4" /* If and how to collect syscall time. systime_no : do not collect systime diff --git a/tracegrind/tests/test_basic.vgtest b/tracegrind/tests/test_basic.vgtest index 5c95483ea..4f2a05cd8 100644 --- a/tracegrind/tests/test_basic.vgtest +++ b/tracegrind/tests/test_basic.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_basic.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_basic +vgopts: --tracegrind-out-file=tracegrind.out.test_basic.msgpack.lz4 post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_basic.msgpack.lz4 --stats | ./filter_trace cleanup: rm -f tracegrind.out.test_basic.msgpack.lz4 diff --git a/tracegrind/tests/test_enter_inlined.vgtest b/tracegrind/tests/test_enter_inlined.vgtest index e8d628355..1b5d7c55d 100644 --- a/tracegrind/tests/test_enter_inlined.vgtest +++ b/tracegrind/tests/test_enter_inlined.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_enter_inlined.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_enter_inlined --instr-atstart=no --read-inline-info=yes +vgopts: --tracegrind-out-file=tracegrind.out.test_enter_inlined.msgpack.lz4 --instr-atstart=no --read-inline-info=yes post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_enter_inlined.msgpack.lz4 | ./filter_trace cleanup: rm -f tracegrind.out.test_enter_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_exception.vgtest b/tracegrind/tests/test_exception.vgtest index 3c098a799..e707c330a 100644 --- a/tracegrind/tests/test_exception.vgtest +++ b/tracegrind/tests/test_exception.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_exception.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_exception --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_exception.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_exception.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=catcher|fn=thrower|fn=do_throw)' cleanup: rm -f tracegrind.out.test_exception.msgpack.lz4 diff --git a/tracegrind/tests/test_foo_bar_baz.vgtest b/tracegrind/tests/test_foo_bar_baz.vgtest index c1cfaeefe..c2a7b3efb 100644 --- a/tracegrind/tests/test_foo_bar_baz.vgtest +++ b/tracegrind/tests/test_foo_bar_baz.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_foo_bar_baz.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_foo_bar_baz 
--instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_foo_bar_baz.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_foo_bar_baz.msgpack.lz4 | ./filter_trace cleanup: rm -f tracegrind.out.test_foo_bar_baz.msgpack.lz4 diff --git a/tracegrind/tests/test_inline.vgtest b/tracegrind/tests/test_inline.vgtest index f6ab09838..5c96843d2 100644 --- a/tracegrind/tests/test_inline.vgtest +++ b/tracegrind/tests/test_inline.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_inline.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_inline --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_inline.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_inline.msgpack.lz4 | ./filter_trace cleanup: rm -f tracegrind.out.test_inline.msgpack.lz4 diff --git a/tracegrind/tests/test_instr_toggle.vgtest b/tracegrind/tests/test_instr_toggle.vgtest index 75adf7aba..3247a09e4 100644 --- a/tracegrind/tests/test_instr_toggle.vgtest +++ b/tracegrind/tests/test_instr_toggle.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_instr_toggle.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_instr_toggle --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_instr_toggle.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_instr_toggle.msgpack.lz4 | ./filter_trace cleanup: rm -f tracegrind.out.test_instr_toggle.msgpack.lz4 diff --git a/tracegrind/tests/test_longjmp.vgtest b/tracegrind/tests/test_longjmp.vgtest index 2ee68b2af..6c438fb85 100644 --- a/tracegrind/tests/test_longjmp.vgtest +++ b/tracegrind/tests/test_longjmp.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_longjmp.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_longjmp --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_longjmp.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_longjmp.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=outer|fn=middle|fn=inner)' cleanup: rm -f tracegrind.out.test_longjmp.msgpack.lz4 diff --git a/tracegrind/tests/test_marker.vgtest b/tracegrind/tests/test_marker.vgtest index fe3b45a0a..9165191e0 100644 --- a/tracegrind/tests/test_marker.vgtest +++ b/tracegrind/tests/test_marker.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_marker.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_marker +vgopts: --tracegrind-out-file=tracegrind.out.test_marker.msgpack.lz4 post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_marker.msgpack.lz4 --event MARKER --raw | ./filter_trace cleanup: rm -f tracegrind.out.test_marker.msgpack.lz4 diff --git a/tracegrind/tests/test_nested_inlined.vgtest b/tracegrind/tests/test_nested_inlined.vgtest index ff512078c..adaf9a895 100644 --- a/tracegrind/tests/test_nested_inlined.vgtest +++ b/tracegrind/tests/test_nested_inlined.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_nested_inlined.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_nested_inlined --instr-atstart=no --read-inline-info=yes +vgopts: --tracegrind-out-file=tracegrind.out.test_nested_inlined.msgpack.lz4 --instr-atstart=no 
--read-inline-info=yes post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_nested_inlined.msgpack.lz4 | ./filter_trace cleanup: rm -f tracegrind.out.test_nested_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_recursion.vgtest b/tracegrind/tests/test_recursion.vgtest index a675bd483..bfff7defe 100644 --- a/tracegrind/tests/test_recursion.vgtest +++ b/tracegrind/tests/test_recursion.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_recursion.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_recursion --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_recursion.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_recursion.msgpack.lz4 | ./filter_trace | grep -E '(event=MARKER|fn=recurse)' | awk '/MARKER/{print} /ENTER_FN/{e++} /EXIT_FN/{x++} END{print "ENTER_FN count: "e; print "EXIT_FN count: "x}' cleanup: rm -f tracegrind.out.test_recursion.msgpack.lz4 diff --git a/tracegrind/tests/test_schema.vgtest b/tracegrind/tests/test_schema.vgtest index 4f96bd7df..482a552de 100644 --- a/tracegrind/tests/test_schema.vgtest +++ b/tracegrind/tests/test_schema.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_basic.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_schema +vgopts: --tracegrind-out-file=tracegrind.out.test_schema.msgpack.lz4 post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_schema.msgpack.lz4 --schema | ./filter_trace cleanup: rm -f tracegrind.out.test_schema.msgpack.lz4 diff --git a/tracegrind/tests/test_signal.vgtest b/tracegrind/tests/test_signal.vgtest index a61fbd7bd..06b87f70c 100644 --- a/tracegrind/tests/test_signal.vgtest +++ b/tracegrind/tests/test_signal.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_signal.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_signal --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_signal.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_signal.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=caller|fn=handler_fn)' cleanup: rm -f tracegrind.out.test_signal.msgpack.lz4 diff --git a/tracegrind/tests/test_syscall.vgtest b/tracegrind/tests/test_syscall.vgtest index d67cf88d7..d4a57deec 100644 --- a/tracegrind/tests/test_syscall.vgtest +++ b/tracegrind/tests/test_syscall.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_syscall.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_syscall --instr-atstart=no --collect-systime=nsec +vgopts: --tracegrind-out-file=tracegrind.out.test_syscall.msgpack.lz4 --instr-atstart=no --collect-systime=nsec post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counter Units:|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' cleanup: rm -f tracegrind.out.test_syscall.msgpack.lz4 diff --git a/tracegrind/tests/test_tailcall.vgtest b/tracegrind/tests/test_tailcall.vgtest index f954ada67..b971e1de7 100644 --- a/tracegrind/tests/test_tailcall.vgtest +++ b/tracegrind/tests/test_tailcall.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_tailcall.bin -vgopts: 
--tracegrind-out-file=tracegrind.out.test_tailcall --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_tailcall.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_tailcall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=chain_)' cleanup: rm -f tracegrind.out.test_tailcall.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_create.vgtest b/tracegrind/tests/test_thread_create.vgtest index 1c47b1e23..3c6ff7704 100644 --- a/tracegrind/tests/test_thread_create.vgtest +++ b/tracegrind/tests/test_thread_create.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_thread_create.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_thread_create --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_create.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_create.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|MARKER|THREAD_CREATE|FORK|thread_fn|Showing |\()' | sed 's/child_pid=[0-9]*/child_pid=N/' cleanup: rm -f tracegrind.out.test_thread_create.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_interleave.vgtest b/tracegrind/tests/test_thread_interleave.vgtest index 76a5ac075..ab3ad0948 100644 --- a/tracegrind/tests/test_thread_interleave.vgtest +++ b/tracegrind/tests/test_thread_interleave.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_thread_interleave.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_thread_interleave --instr-atstart=no +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_interleave.msgpack.lz4 --instr-atstart=no post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_interleave.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |\(|MARKER|THREAD_CREATE|fn=work_a |fn=work_b |fn=work_c |fn=depth_a1 |fn=depth_a2 |fn=depth_b1 |fn=depth_c1 |fn=depth_c2 )' | sort -t'|' -k2,2 -s cleanup: rm -f tracegrind.out.test_thread_interleave.msgpack.lz4 diff --git a/tracegrind/tests/test_toggle_collect.vgtest b/tracegrind/tests/test_toggle_collect.vgtest index a0178f7eb..0f1123dfb 100644 --- a/tracegrind/tests/test_toggle_collect.vgtest +++ b/tracegrind/tests/test_toggle_collect.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_toggle_collect.bin -vgopts: --tracegrind-out-file=tracegrind.out.test_toggle_collect +vgopts: --tracegrind-out-file=tracegrind.out.test_toggle_collect.msgpack.lz4 post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_toggle_collect.msgpack.lz4 --stats | ./filter_trace cleanup: rm -f tracegrind.out.test_toggle_collect.msgpack.lz4 From 4c7b519885559f59970d17ae13ce49750eaf3f0d Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sun, 8 Feb 2026 11:37:14 +0000 Subject: [PATCH 23/26] feat(tracegrind): bump output format from v3 to v4 with top-level counters array Extract counter column names from inline event schemas into a separate top-level `counters` field and nest counter deltas as a sub-array within fn-call data rows. This makes the schema self-describing for counter layout without repeating counter names in every event type. 
Co-Authored-By: Claude Opus 4.6 --- tracegrind/docs/tracegrind-msgpack-format.md | 98 ++++++++++--------- tracegrind/dump.c | 67 ++++++------- tracegrind/scripts/tracegrind-analyzer | 12 +++ tracegrind/tests/test_basic.post.exp | 9 +- tracegrind/tests/test_enter_inlined.post.exp | 9 +- tracegrind/tests/test_exception.post.exp | 1 + tracegrind/tests/test_exception.vgtest | 2 +- tracegrind/tests/test_foo_bar_baz.post.exp | 9 +- tracegrind/tests/test_inline.post.exp | 9 +- tracegrind/tests/test_instr_toggle.post.exp | 9 +- tracegrind/tests/test_longjmp.post.exp | 1 + tracegrind/tests/test_longjmp.vgtest | 2 +- tracegrind/tests/test_marker.post.exp | 9 +- tracegrind/tests/test_nested_inlined.post.exp | 9 +- tracegrind/tests/test_schema.post.exp | 9 +- tracegrind/tests/test_signal.post.exp | 1 + tracegrind/tests/test_signal.vgtest | 2 +- tracegrind/tests/test_syscall.post.exp | 9 +- tracegrind/tests/test_syscall.vgtest | 2 +- tracegrind/tests/test_tailcall.post.exp | 1 + tracegrind/tests/test_tailcall.vgtest | 2 +- tracegrind/tests/test_thread_create.post.exp | 9 +- tracegrind/tests/test_thread_create.vgtest | 2 +- .../tests/test_thread_interleave.post.exp | 9 +- .../tests/test_thread_interleave.vgtest | 2 +- tracegrind/tests/test_toggle_collect.post.exp | 9 +- 26 files changed, 171 insertions(+), 132 deletions(-) diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md index 6a7f3466b..f6dabeb31 100644 --- a/tracegrind/docs/tracegrind-msgpack-format.md +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -23,7 +23,7 @@ Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 | Offset | Size | Field | Description | |--------|------|---------|-------------| | 0 | 4 | magic | ASCII `TGMP` (0x54 0x47 0x4D 0x50) | -| 4 | 4 | version | Format version, uint32 LE (currently 3) | +| 4 | 4 | version | Format version, uint32 LE (currently 4) | ## Chunk Format @@ -41,19 +41,20 @@ The first chunk contains a MsgPack map describing the discriminated union schema ```json { - "version": 3, + "version": 4, "format": "tracegrind-msgpack", "creator": "valgrind-tracegrind", "creator_version": "3.26.0.codspeed", "event_schemas": { "0": ["seq", "tid", "event", "marker"], - "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], - "2": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], - "3": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], - "4": ["seq", "tid", "event", "fn", "obj", "file", "line", "Ir", ...], + "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "2": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "3": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "4": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], "5": ["seq", "tid", "event", "child_pid"], "6": ["seq", "tid", "event", "child_tid"] }, + "counters": ["Ir"], "counter_units": { "sysTime": "ns", "sysCpuTime": "ns" @@ -61,6 +62,8 @@ The first chunk contains a MsgPack map describing the discriminated union schema } ``` +The `counters` array lists the dynamic counter column names (e.g. `["Ir"]` or `["Ir", "sysCount", "sysTime", "sysCpuTime"]`). Event schemas for types 1-4 use `"counters"` as a sentinel at index 7 to indicate that a sub-array of counter deltas appears at that position in data rows. 
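+
+For example, a consumer can expand the counter sub-array of a fn-call row back
+into named values using the top-level `counters` list, and attach units from
+`counter_units` where present (a minimal sketch; `decode_fn_call_row` is a
+hypothetical helper that takes the already-decoded header map and one data row):
+
+```python
+def decode_fn_call_row(header: dict, row: list) -> dict:
+    # Row layout for events 1-4: [seq, tid, event, fn, obj, file, line, [deltas...]]
+    seq, tid, event, fn, obj, file, line, deltas = row
+    counters = header.get("counters", [])
+    units = header.get("counter_units", {})
+    decoded = {"seq": seq, "tid": tid, "event": event,
+               "fn": fn, "obj": obj, "file": file, "line": line}
+    for name, value in zip(counters, deltas):
+        # Time-based counters (e.g. sysTime) carry a unit; others are dimensionless.
+        unit = units.get(name)
+        decoded[name] = (value, unit) if unit else value
+    return decoded
+```
+
+Rows of the other event types (MARKER, FORK, THREAD_CREATE) carry no counter
+sub-array and can be decoded directly from their schemas.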
+ ### Event Types | Type | Name | Description | @@ -86,31 +89,31 @@ The first chunk contains a MsgPack map describing the discriminated union schema **ENTER_FN/EXIT_FN rows (event 1, 2):** -| Index | Name | Type | Description | -|-------|-------|--------|-------------| -| 0 | seq | uint64 | Sequence number | -| 1 | tid | int32 | Thread ID | -| 2 | event | int | 1 = ENTER_FN, 2 = EXIT_FN | -| 3 | fn | string | Function name | -| 4 | obj | string | Shared object path | -| 5 | file | string | Source file path | -| 6 | line | int32 | Line number (0 if unknown) | -| 7+ | ... | int64 | Event counter deltas (Ir, Dr, Dw, etc.) | +| Index | Name | Type | Description | +|-------|----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 1 = ENTER_FN, 2 = EXIT_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7 | counters | array | Counter deltas sub-array (Ir, Dr, Dw, etc.) | **ENTER_INLINED_FN/EXIT_INLINED_FN rows (event 3, 4):** Same schema as ENTER_FN/EXIT_FN rows. -| Index | Name | Type | Description | -|-------|-------|--------|-------------| -| 0 | seq | uint64 | Sequence number | -| 1 | tid | int32 | Thread ID | -| 2 | event | int | 3 = ENTER_INLINED_FN, 4 = EXIT_INLINED_FN | -| 3 | fn | string | Function name | -| 4 | obj | string | Shared object path | -| 5 | file | string | Source file path | -| 6 | line | int32 | Line number (0 if unknown) | -| 7+ | ... | int64 | Event counter deltas (Ir, Dr, Dw, etc.) | +| Index | Name | Type | Description | +|-------|----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 3 = ENTER_INLINED_FN, 4 = EXIT_INLINED_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7 | counters | array | Counter deltas sub-array (Ir, Dr, Dw, etc.) | **FORK rows (event 5):** @@ -132,7 +135,7 @@ Same schema as ENTER_FN/EXIT_FN rows. ### Event Counter Columns -For ENTER_FN/EXIT_FN/ENTER_INLINED_FN/EXIT_INLINED_FN rows, event counters appear as delta values starting at index 7. Which counters are present depends on Tracegrind options: +For ENTER_FN/EXIT_FN/ENTER_INLINED_FN/EXIT_INLINED_FN rows, event counters appear as a sub-array at index 7. The order of values in the sub-array corresponds to the top-level `counters` array in the schema. Which counters are present depends on Tracegrind options: `Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim` @@ -153,13 +156,13 @@ When `--collect-systime` is not set, the `counter_units` map is empty. Each data chunk contains concatenated MsgPack arrays. The row format depends on the event type (index 2): ``` -[seq, tid, 0, marker] # MARKER -[seq, tid, 1, fn, obj, file, line, delta_Ir, ...] # ENTER_FN -[seq, tid, 2, fn, obj, file, line, delta_Ir, ...] # EXIT_FN -[seq, tid, 3, fn, obj, file, line, delta_Ir, ...] # ENTER_INLINED_FN -[seq, tid, 4, fn, obj, file, line, delta_Ir, ...] 
# EXIT_INLINED_FN -[seq, tid, 5, child_pid] # FORK -[seq, tid, 6, child_tid] # THREAD_CREATE +[seq, tid, 0, marker] # MARKER +[seq, tid, 1, fn, obj, file, line, [delta_Ir, ...]] # ENTER_FN +[seq, tid, 2, fn, obj, file, line, [delta_Ir, ...]] # EXIT_FN +[seq, tid, 3, fn, obj, file, line, [delta_Ir, ...]] # ENTER_INLINED_FN +[seq, tid, 4, fn, obj, file, line, [delta_Ir, ...]] # EXIT_INLINED_FN +[seq, tid, 5, child_pid] # FORK +[seq, tid, 6, child_tid] # THREAD_CREATE ``` The reference implementation writes 4096 rows per chunk. @@ -177,16 +180,15 @@ def read_tracegrind(filepath): with open(filepath, 'rb') as f: assert f.read(4) == b'TGMP' version = struct.unpack(' Dict[str, Any]: # Fallback for old format with 'columns' key columns = schema.get('columns', []) + counter_names = schema.get('counters', []) + result = {} for i, val in enumerate(row): if i < len(columns): key = columns[i] if key == 'event': result[key] = get_event_name(val) + elif key == 'counters' and isinstance(val, list): + # Expand counters sub-array using top-level counter names + for j, cval in enumerate(val): + if j < len(counter_names): + result[counter_names[j]] = cval + else: + result[f'_counter{j}'] = cval else: result[key] = val else: @@ -137,6 +146,9 @@ def print_schema(schema: Dict[str, Any], version: int) -> None: elif 'columns' in schema: print(f"Columns: {schema['columns']}") + if schema.get('counters'): + print(f"Counters: {schema['counters']}") + if schema.get('counter_units'): print(f"Counter Units: {dict(sorted(schema['counter_units'].items()))}") print() diff --git a/tracegrind/tests/test_basic.post.exp b/tracegrind/tests/test_basic.post.exp index c1c546e90..19397d9bc 100644 --- a/tracegrind/tests/test_basic.post.exp +++ b/tracegrind/tests/test_basic.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Total rows: N diff --git a/tracegrind/tests/test_enter_inlined.post.exp b/tracegrind/tests/test_enter_inlined.post.exp index 2af0e323f..f63eb2906 100644 --- a/tracegrind/tests/test_enter_inlined.post.exp +++ b/tracegrind/tests/test_enter_inlined.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 
'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start diff --git a/tracegrind/tests/test_exception.post.exp b/tracegrind/tests/test_exception.post.exp index 7f57de18e..7089c29d3 100644 --- a/tracegrind/tests/test_exception.post.exp +++ b/tracegrind/tests/test_exception.post.exp @@ -3,6 +3,7 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=ENTER_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N diff --git a/tracegrind/tests/test_exception.vgtest b/tracegrind/tests/test_exception.vgtest index e707c330a..1567cd9d2 100644 --- a/tracegrind/tests/test_exception.vgtest +++ b/tracegrind/tests/test_exception.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_exception.bin vgopts: --tracegrind-out-file=tracegrind.out.test_exception.msgpack.lz4 --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_exception.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=catcher|fn=thrower|fn=do_throw)' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_exception.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=catcher|fn=thrower|fn=do_throw)' cleanup: rm -f tracegrind.out.test_exception.msgpack.lz4 diff --git a/tracegrind/tests/test_foo_bar_baz.post.exp b/tracegrind/tests/test_foo_bar_baz.post.exp index 5f71183f9..ad3a60185 100644 --- a/tracegrind/tests/test_foo_bar_baz.post.exp +++ b/tracegrind/tests/test_foo_bar_baz.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start diff --git a/tracegrind/tests/test_inline.post.exp b/tracegrind/tests/test_inline.post.exp index 7156df4a3..f06c345cb 100644 --- a/tracegrind/tests/test_inline.post.exp +++ b/tracegrind/tests/test_inline.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 
(EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start diff --git a/tracegrind/tests/test_instr_toggle.post.exp b/tracegrind/tests/test_instr_toggle.post.exp index 5c6df34b4..1ee05299d 100644 --- a/tracegrind/tests/test_instr_toggle.post.exp +++ b/tracegrind/tests/test_instr_toggle.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=before-fibo diff --git a/tracegrind/tests/test_longjmp.post.exp b/tracegrind/tests/test_longjmp.post.exp index 47ff2d6de..d0524b77e 100644 --- a/tracegrind/tests/test_longjmp.post.exp +++ b/tracegrind/tests/test_longjmp.post.exp @@ -3,6 +3,7 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=ENTER_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N diff --git a/tracegrind/tests/test_longjmp.vgtest b/tracegrind/tests/test_longjmp.vgtest index 6c438fb85..0291a7fbe 100644 --- a/tracegrind/tests/test_longjmp.vgtest +++ b/tracegrind/tests/test_longjmp.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_longjmp.bin vgopts: --tracegrind-out-file=tracegrind.out.test_longjmp.msgpack.lz4 --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_longjmp.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=outer|fn=middle|fn=inner)' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_longjmp.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=outer|fn=middle|fn=inner)' cleanup: rm -f tracegrind.out.test_longjmp.msgpack.lz4 diff --git a/tracegrind/tests/test_marker.post.exp b/tracegrind/tests/test_marker.post.exp index f2d89fdd8..cd8748b02 100644 --- a/tracegrind/tests/test_marker.post.exp +++ b/tracegrind/tests/test_marker.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 
(ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows [N, 1, 0, 'start-work'] diff --git a/tracegrind/tests/test_nested_inlined.post.exp b/tracegrind/tests/test_nested_inlined.post.exp index ca8fc06cc..0d0571af2 100644 --- a/tracegrind/tests/test_nested_inlined.post.exp +++ b/tracegrind/tests/test_nested_inlined.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start diff --git a/tracegrind/tests/test_schema.post.exp b/tracegrind/tests/test_schema.post.exp index e9dfbb564..d30dbc939 100644 --- a/tracegrind/tests/test_schema.post.exp +++ b/tracegrind/tests/test_schema.post.exp @@ -4,10 +4,11 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] diff --git a/tracegrind/tests/test_signal.post.exp b/tracegrind/tests/test_signal.post.exp index e6a840785..ec413adf6 100644 --- a/tracegrind/tests/test_signal.post.exp +++ b/tracegrind/tests/test_signal.post.exp @@ -3,6 +3,7 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | 
marker=start seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N diff --git a/tracegrind/tests/test_signal.vgtest b/tracegrind/tests/test_signal.vgtest index 06b87f70c..66391dfa1 100644 --- a/tracegrind/tests/test_signal.vgtest +++ b/tracegrind/tests/test_signal.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_signal.bin vgopts: --tracegrind-out-file=tracegrind.out.test_signal.msgpack.lz4 --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_signal.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=caller|fn=handler_fn)' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_signal.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=caller|fn=handler_fn)' cleanup: rm -f tracegrind.out.test_signal.msgpack.lz4 diff --git a/tracegrind/tests/test_syscall.post.exp b/tracegrind/tests/test_syscall.post.exp index b4ff2479f..4f8a1c6fa 100644 --- a/tracegrind/tests/test_syscall.post.exp +++ b/tracegrind/tests/test_syscall.post.exp @@ -3,12 +3,13 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir', 'sysCount', 'sysTime', 'sysCpuTime'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir', 'sysCount', 'sysTime', 'sysCpuTime'] Counter Units: {'sysCpuTime': 'ns', 'sysTime': 'ns'} Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start diff --git a/tracegrind/tests/test_syscall.vgtest b/tracegrind/tests/test_syscall.vgtest index d4a57deec..848ca69f7 100644 --- a/tracegrind/tests/test_syscall.vgtest +++ b/tracegrind/tests/test_syscall.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_syscall.bin vgopts: --tracegrind-out-file=tracegrind.out.test_syscall.msgpack.lz4 --instr-atstart=no --collect-systime=nsec -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counter Units:|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Counter Units:|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' cleanup: rm -f tracegrind.out.test_syscall.msgpack.lz4 diff --git a/tracegrind/tests/test_tailcall.post.exp b/tracegrind/tests/test_tailcall.post.exp index 2cd177ff5..9d08266f6 100644 --- 
a/tracegrind/tests/test_tailcall.post.exp +++ b/tracegrind/tests/test_tailcall.post.exp @@ -3,6 +3,7 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=ENTER_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N diff --git a/tracegrind/tests/test_tailcall.vgtest b/tracegrind/tests/test_tailcall.vgtest index b971e1de7..c5acf2b7b 100644 --- a/tracegrind/tests/test_tailcall.vgtest +++ b/tracegrind/tests/test_tailcall.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_tailcall.bin vgopts: --tracegrind-out-file=tracegrind.out.test_tailcall.msgpack.lz4 --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_tailcall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |MARKER|fn=chain_)' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_tailcall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=chain_)' cleanup: rm -f tracegrind.out.test_tailcall.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_create.post.exp b/tracegrind/tests/test_thread_create.post.exp index 91eef3fe8..d695fbc3d 100644 --- a/tracegrind/tests/test_thread_create.post.exp +++ b/tracegrind/tests/test_thread_create.post.exp @@ -3,12 +3,13 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 diff --git a/tracegrind/tests/test_thread_create.vgtest b/tracegrind/tests/test_thread_create.vgtest index 3c6ff7704..6ce6328d2 100644 --- a/tracegrind/tests/test_thread_create.vgtest +++ b/tracegrind/tests/test_thread_create.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_thread_create.bin vgopts: --tracegrind-out-file=tracegrind.out.test_thread_create.msgpack.lz4 --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_create.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|MARKER|THREAD_CREATE|FORK|thread_fn|Showing |\()' | sed 's/child_pid=[0-9]*/child_pid=N/' +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_create.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|MARKER|THREAD_CREATE|FORK|thread_fn|Showing |\()' | sed 's/child_pid=[0-9]*/child_pid=N/' cleanup: rm -f 
tracegrind.out.test_thread_create.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_interleave.post.exp b/tracegrind/tests/test_thread_interleave.post.exp index 906e77b66..63f1c227f 100644 --- a/tracegrind/tests/test_thread_interleave.post.exp +++ b/tracegrind/tests/test_thread_interleave.post.exp @@ -3,12 +3,13 @@ Format Name: tracegrind-msgpack Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Showing N of N rows seq=N | tid=1 | event=MARKER | marker=start seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 diff --git a/tracegrind/tests/test_thread_interleave.vgtest b/tracegrind/tests/test_thread_interleave.vgtest index ab3ad0948..02ea2cd8d 100644 --- a/tracegrind/tests/test_thread_interleave.vgtest +++ b/tracegrind/tests/test_thread_interleave.vgtest @@ -1,5 +1,5 @@ prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null prog: test_thread_interleave.bin vgopts: --tracegrind-out-file=tracegrind.out.test_thread_interleave.msgpack.lz4 --instr-atstart=no -post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_interleave.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Showing |\(|MARKER|THREAD_CREATE|fn=work_a |fn=work_b |fn=work_c |fn=depth_a1 |fn=depth_a2 |fn=depth_b1 |fn=depth_c1 |fn=depth_c2 )' | sort -t'|' -k2,2 -s +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_interleave.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |\(|MARKER|THREAD_CREATE|fn=work_a |fn=work_b |fn=work_c |fn=depth_a1 |fn=depth_a2 |fn=depth_b1 |fn=depth_c1 |fn=depth_c2 )' | sort -t'|' -k2,2 -s cleanup: rm -f tracegrind.out.test_thread_interleave.msgpack.lz4 diff --git a/tracegrind/tests/test_toggle_collect.post.exp b/tracegrind/tests/test_toggle_collect.post.exp index c1c546e90..19397d9bc 100644 --- a/tracegrind/tests/test_toggle_collect.post.exp +++ b/tracegrind/tests/test_toggle_collect.post.exp @@ -4,12 +4,13 @@ Schema Version: N Event Schemas (discriminated union): 0 (MARKER): ['seq', 'tid', 'event', 'marker'] - 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] - 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'Ir'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] 5 (FORK): ['seq', 
'tid', 'event', 'child_pid'] 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] Total rows: N From 6717f23cbcc963ad5679d4bc8e502c638fd1934d Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sun, 8 Feb 2026 11:39:57 +0000 Subject: [PATCH 24/26] docs(tracegrind): add example output files and generation script Adds a directory for pre-generated tracegrind output files that serve as reference material for trace parser implementations, along with a script to regenerate them. Co-Authored-By: Claude Opus 4.6 --- tracegrind/examples/.gitignore | 2 ++ tracegrind/examples/README.md | 59 ++++++++++++++++++++++++++++++ tracegrind/examples/generate.sh | 63 +++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 tracegrind/examples/.gitignore create mode 100644 tracegrind/examples/README.md create mode 100755 tracegrind/examples/generate.sh diff --git a/tracegrind/examples/.gitignore b/tracegrind/examples/.gitignore new file mode 100644 index 000000000..585f5d244 --- /dev/null +++ b/tracegrind/examples/.gitignore @@ -0,0 +1,2 @@ +*.tgtrace +*.txt diff --git a/tracegrind/examples/README.md b/tracegrind/examples/README.md new file mode 100644 index 000000000..d1a6b2834 --- /dev/null +++ b/tracegrind/examples/README.md @@ -0,0 +1,59 @@ +# Tracegrind example output files + +This directory contains pre-generated tracegrind output files for use as +reference material when implementing a trace parser. + +Each test produces two files: + +- **`.tgtrace`** — binary trace file (msgpack + lz4 compressed) +- **`.txt`** — full human-readable dump from `tracegrind-analyzer` + +## Files + +| Name | Description | Extra options | +|------|-------------|---------------| +| `test_basic` | Full program trace (loader + libc + main) | — | +| `test_marker` | `VALGRIND_TRACEGRIND_MARKER` client request | — | +| `test_toggle_collect` | `--toggle-collect` style collection | — | +| `test_foo_bar_baz` | Simple call chain: `foo -> bar -> baz` | `--instr-atstart=no` | +| `test_inline` | Inlined function calls | `--instr-atstart=no` | +| `test_enter_inlined` | `ENTER_INLINED_FN` / `EXIT_INLINED_FN` events | `--instr-atstart=no --read-inline-info=yes` | +| `test_nested_inlined` | Nested inlined function calls | `--instr-atstart=no --read-inline-info=yes` | +| `test_recursion` | Recursive function calls | `--instr-atstart=no` | +| `test_tailcall` | Tail-call optimized functions | `--instr-atstart=no` | +| `test_longjmp` | `setjmp` / `longjmp` unwinding | `--instr-atstart=no` | +| `test_signal` | Signal handler invocation | `--instr-atstart=no` | +| `test_exception` | C++ exception throw/catch | `--instr-atstart=no` | +| `test_thread_create` | `THREAD_CREATE` events | `--instr-atstart=no` | +| `test_thread_interleave` | Multi-thread interleaved callstacks | `--instr-atstart=no` | +| `test_syscall` | System call timing (`sysCount`, `sysTime` counters) | `--instr-atstart=no --collect-systime=nsec` | +| `test_instr_toggle` | Instrumentation toggle on/off mid-run | `--instr-atstart=no` | + +## Regenerating + +From the repository root (after building valgrind): + +```bash +bash tracegrind/examples/generate.sh +``` + +## Format + +The `.tgtrace` files use the tracegrind msgpack format (lz4-compressed msgpack). +See `tracegrind/docs/tracegrind-msgpack-format.md` for the format specification. 
+ +Use `tracegrind/scripts/tracegrind-analyzer` to inspect any trace file: + +```bash +# Full dump +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace + +# Schema only +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --schema + +# Statistics +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --stats + +# Filter by event type +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --event ENTER_FN +``` diff --git a/tracegrind/examples/generate.sh b/tracegrind/examples/generate.sh new file mode 100755 index 000000000..00aa6b072 --- /dev/null +++ b/tracegrind/examples/generate.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# +# Generate tracegrind example output files. +# +# Run from the valgrind-codspeed repository root: +# bash tracegrind/examples/generate.sh +# +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +VG="$ROOT/vg-in-place" +ANALYZER="$ROOT/tracegrind/scripts/tracegrind-analyzer" +TESTS="$ROOT/tracegrind/tests" +OUT="$ROOT/tracegrind/examples" + +if [ ! -x "$VG" ]; then + echo "Error: vg-in-place not found at $VG" >&2 + echo "Build valgrind first (./configure && make)" >&2 + exit 1 +fi + +generate() { + local name="$1" + local binary="$2" + shift 2 + local vgopts=("$@") + + local trace="$OUT/${name}.tgtrace" + local txt="$OUT/${name}.txt" + + echo "Generating $name ..." + "$VG" --tool=tracegrind \ + --tracegrind-out-file="$trace" \ + "${vgopts[@]}" \ + "$TESTS/$binary" > /dev/null 2>&1 + + "$ANALYZER" "$trace" > "$txt" 2>&1 + + echo " -> $(wc -c < "$trace") bytes, $(wc -l < "$txt") lines" +} + +# Remove previous outputs +rm -f "$OUT"/*.tgtrace "$OUT"/*.txt + +generate test_basic test_basic.bin +generate test_marker test_marker.bin +generate test_toggle_collect test_toggle_collect.bin +generate test_foo_bar_baz test_foo_bar_baz.bin --instr-atstart=no +generate test_inline test_inline.bin --instr-atstart=no +generate test_enter_inlined test_enter_inlined.bin --instr-atstart=no --read-inline-info=yes +generate test_nested_inlined test_nested_inlined.bin --instr-atstart=no --read-inline-info=yes +generate test_recursion test_recursion.bin --instr-atstart=no +generate test_tailcall test_tailcall.bin --instr-atstart=no +generate test_longjmp test_longjmp.bin --instr-atstart=no +generate test_signal test_signal.bin --instr-atstart=no +generate test_exception test_exception.bin --instr-atstart=no +generate test_thread_create test_thread_create.bin --instr-atstart=no +generate test_thread_interleave test_thread_interleave.bin --instr-atstart=no +generate test_syscall test_syscall.bin --instr-atstart=no --collect-systime=nsec +generate test_instr_toggle test_instr_toggle.bin --instr-atstart=no + +echo "" +echo "Done. Generated $(ls "$OUT"/*.tgtrace 2>/dev/null | wc -l) trace files." From 3d773211ef8652235b37de26789f64361fcd15c8 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sun, 8 Feb 2026 12:33:32 +0000 Subject: [PATCH 25/26] perf(tracegrind): use faster LZ4 compression and cache string lengths Perf profiling revealed LZ4 compression (11.4% of runtime) and per-event strlen calls (4.6%) as the top two optimization targets. Switch from LZ4_compress_default to LZ4_compress_fast with acceleration=2 for faster compression at marginal ratio cost. Cache name_len in fn_node, file_node, and obj_node structs so msgpack_write_str receives pre-computed lengths instead of calling VG_(strlen) on every trace event. 
This eliminates strlen from the perf profile entirely. Benchmarked improvement: 55-78ms saved (10-13% of the TG-CG gap) on ls -lR /usr/share/doc workload. Co-Authored-By: Claude Opus 4.6 --- tracegrind/dump.c | 98 ++++++++++++++++++++++++++++++++++++--------- tracegrind/fn.c | 3 ++ tracegrind/global.h | 3 ++ tracegrind/tg_lz4.c | 5 ++- 4 files changed, 87 insertions(+), 22 deletions(-) diff --git a/tracegrind/dump.c b/tracegrind/dump.c index 540d64a3b..837f69b9f 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -292,8 +292,10 @@ static void msgpack_init_state(void) /* Add an ENTER/EXIT row to the msgpack output */ static void msgpack_add_row(ULong seq, Int tid, Int event, - const HChar* fn_name, const HChar* obj_name, - const HChar* file_name, Int line, + const HChar* fn_name, Int fn_len, + const HChar* obj_name, Int obj_len, + const HChar* file_name, Int file_len, + Int line, const ULong* deltas, Int n_deltas) { /* Each row is a msgpack array: 7 fixed + 1 counters sub-array */ @@ -303,9 +305,9 @@ static void msgpack_add_row(ULong seq, Int tid, Int event, msgpack_write_uint(&mp_state.buf, seq); msgpack_write_int(&mp_state.buf, tid); msgpack_write_int(&mp_state.buf, event); - msgpack_write_str(&mp_state.buf, fn_name, -1); - msgpack_write_str(&mp_state.buf, obj_name, -1); - msgpack_write_str(&mp_state.buf, file_name, -1); + msgpack_write_str(&mp_state.buf, fn_name, fn_len); + msgpack_write_str(&mp_state.buf, obj_name, obj_len); + msgpack_write_str(&mp_state.buf, file_name, file_len); msgpack_write_int(&mp_state.buf, line); /* Counters sub-array */ @@ -473,12 +475,35 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, TG_(trace_out).seq++; - /* Resolve function info */ - const HChar* fn_name = fn ? fn->name : "???"; - const HChar* obj_name = (fn && fn->file && fn->file->obj) - ? fn->file->obj->name : "???"; - const HChar* file_name = (fn && fn->file) ? fn->file->name : "???"; - UInt line = 0; + /* Resolve function info with cached lengths */ + const HChar* fn_name; + Int fn_len; + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (fn) { + fn_name = fn->name; + fn_len = (Int)fn->name_len; + if (fn->file) { + file_name = fn->file->name; + file_len = (Int)fn->file->name_len; + if (fn->file->obj) { + obj_name = fn->file->obj->name; + obj_len = (Int)fn->file->obj->name_len; + } else { + obj_name = "???"; obj_len = 3; + } + } else { + file_name = "???"; file_len = 3; + obj_name = "???"; obj_len = 3; + } + } else { + fn_name = "???"; fn_len = 3; + obj_name = "???"; obj_len = 3; + file_name = "???"; file_len = 3; + } /* Compute deltas for all event counters */ ULong deltas[64]; /* es->size is always small */ @@ -497,7 +522,8 @@ void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, Int event_val = is_enter ? TG_EV_ENTER_FN : TG_EV_EXIT_FN; msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, - fn_name, obj_name, file_name, (Int)line, + fn_name, fn_len, obj_name, obj_len, + file_name, file_len, 0, deltas, es->size); } @@ -522,9 +548,24 @@ void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) TG_(trace_out).seq++; const HChar* fn_name = inl_fn; - const HChar* obj_name = bb->obj ? bb->obj->name : "???"; - const HChar* file_name = (bb->fn && bb->fn->file) ? 
bb->fn->file->name : "???"; - UInt line = bb->line; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; file_len = 3; + } ULong deltas[64]; tl_assert(es->size <= 64); @@ -540,7 +581,8 @@ void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) } msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED_FN, - fn_name, obj_name, file_name, (Int)line, + fn_name, fn_len, obj_name, obj_len, + file_name, file_len, (Int)bb->line, deltas, es->size); } @@ -565,9 +607,24 @@ void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) TG_(trace_out).seq++; const HChar* fn_name = inl_fn; - const HChar* obj_name = bb->obj ? bb->obj->name : "???"; - const HChar* file_name = (bb->fn && bb->fn->file) ? bb->fn->file->name : "???"; - UInt line = bb->line; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; file_len = 3; + } ULong deltas[64]; tl_assert(es->size <= 64); @@ -583,7 +640,8 @@ void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) } msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED_FN, - fn_name, obj_name, file_name, (Int)line, + fn_name, fn_len, obj_name, obj_len, + file_name, file_len, (Int)bb->line, deltas, es->size); } diff --git a/tracegrind/fn.c b/tracegrind/fn.c index 710dfd00d..4c314b296 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -350,6 +350,7 @@ obj_node* new_obj_node(DebugInfo* di, obj_node* next) if (obj->name[i]=='/') obj->last_slash_pos = i+1; i++; } + obj->name_len = i; if (runtime_resolve_addrs == 0) search_runtime_resolve(obj); @@ -388,6 +389,7 @@ file_node* new_file_node(const HChar *filename, file_node* file = (file_node*) TG_MALLOC("cl.fn.nfn.1", sizeof(file_node)); file->name = VG_(strdup)("cl.fn.nfn.2", filename); + file->name_len = VG_(strlen)(filename); for (i = 0; i < N_FN_ENTRIES; i++) { file->fns[i] = NULL; } @@ -438,6 +440,7 @@ fn_node* new_fn_node(const HChar *fnname, fn_node* fn = (fn_node*) TG_MALLOC("cl.fn.nfnnd.1", sizeof(fn_node)); fn->name = VG_(strdup)("cl.fn.nfnnd.2", fnname); + fn->name_len = VG_(strlen)(fnname); TG_(stat).distinct_fns++; fn->number = TG_(stat).distinct_fns; diff --git a/tracegrind/global.h b/tracegrind/global.h index 86b883a89..0d042615e 100644 --- a/tracegrind/global.h +++ b/tracegrind/global.h @@ -389,6 +389,7 @@ struct _BBCC { struct _fn_node { HChar* name; + UInt name_len; UInt number; Context* last_cxt; /* LRU info */ Context* pure_cxt; /* the context with only the function itself */ @@ -415,6 +416,7 @@ struct _fn_node { struct _file_node { HChar* name; + UInt name_len; fn_node* fns[N_FN_ENTRIES]; obj_node* obj; file_node* next; @@ -426,6 +428,7 @@ struct _file_node { */ struct _obj_node { const HChar* name; + UInt name_len; UInt last_slash_pos; Addr start; /* Start address of text segment mapping */ diff --git 
a/tracegrind/tg_lz4.c b/tracegrind/tg_lz4.c index 7fa24f0e1..8d4e66531 100644 --- a/tracegrind/tg_lz4.c +++ b/tracegrind/tg_lz4.c @@ -86,8 +86,9 @@ SizeT tg_lz4_compress_bound(SizeT src_size) SizeT tg_lz4_compress(void* dst, SizeT dst_capacity, const void* src, SizeT src_size) { - int result = LZ4_compress_default((const char*)src, (char*)dst, - (int)src_size, (int)dst_capacity); + int result = LZ4_compress_fast((const char*)src, (char*)dst, + (int)src_size, (int)dst_capacity, + 2 /* acceleration */); if (result <= 0) { return 0; } From 139e48e9fa16c919e2265a9e09e8e3a1d0ac3410 Mon Sep 17 00:00:00 2001 From: Arthur Pastel Date: Sun, 8 Feb 2026 12:48:58 +0000 Subject: [PATCH 26/26] style(tracegrind): apply clang-format and add pre-commit hook Add .pre-commit-config.yaml with clang-format scoped to tracegrind/ only. Reformat all tracegrind source files to match the repo's .clang-format style. Co-Authored-By: Claude Opus 4.6 --- .pre-commit-config.yaml | 7 + tracegrind/bb.c | 423 +- tracegrind/bbcc.c | 1182 +++-- tracegrind/callstack.c | 614 ++- tracegrind/clo.c | 776 ++-- tracegrind/context.c | 425 +- tracegrind/costs.c | 50 +- tracegrind/costs.h | 11 +- tracegrind/debug.c | 597 ++- tracegrind/dump.c | 1171 ++--- tracegrind/events.c | 659 +-- tracegrind/events.h | 74 +- tracegrind/fn.c | 1155 ++--- tracegrind/global.h | 664 ++- tracegrind/jumps.c | 250 +- tracegrind/lz4.c | 4803 ++++++++++++--------- tracegrind/lz4.h | 943 ++-- tracegrind/main.c | 2666 ++++++------ tracegrind/sim.c | 2424 ++++++----- tracegrind/tests/test_basic.c | 15 +- tracegrind/tests/test_enter_inlined.c | 47 +- tracegrind/tests/test_exception.cpp | 46 +- tracegrind/tests/test_foo_bar_baz.c | 27 +- tracegrind/tests/test_inline.c | 30 +- tracegrind/tests/test_instr_toggle.c | 27 +- tracegrind/tests/test_longjmp.c | 54 +- tracegrind/tests/test_marker.c | 22 +- tracegrind/tests/test_nested_inlined.c | 62 +- tracegrind/tests/test_recursion.c | 28 +- tracegrind/tests/test_signal.c | 41 +- tracegrind/tests/test_syscall.c | 39 +- tracegrind/tests/test_tailcall.c | 29 +- tracegrind/tests/test_thread_create.c | 26 +- tracegrind/tests/test_thread_interleave.c | 68 +- tracegrind/tests/test_toggle_collect.c | 32 +- tracegrind/tg_lz4.c | 36 +- tracegrind/tg_lz4.h | 6 +- tracegrind/tg_msgpack.c | 264 +- tracegrind/tg_msgpack.h | 6 +- tracegrind/threads.c | 536 ++- tracegrind/tracegrind.h | 40 +- 41 files changed, 10545 insertions(+), 9830 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..bc102d4a9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + files: ^tracegrind/ + types_or: [c, c++] diff --git a/tracegrind/bb.c b/tracegrind/bb.c index 9c69e7c6e..ff5f4111c 100644 --- a/tracegrind/bb.c +++ b/tracegrind/bb.c @@ -39,128 +39,123 @@ void TG_(init_bb_hash)(void) bbs.size = 8437; bbs.entries = 0; - bbs.table = (BB**) TG_MALLOC("cl.bb.ibh.1", - bbs.size * sizeof(BB*)); + bbs.table = (BB**)TG_MALLOC("cl.bb.ibh.1", bbs.size * sizeof(BB*)); - for (i = 0; i < bbs.size; i++) bbs.table[i] = NULL; + for (i = 0; i < bbs.size; i++) + bbs.table[i] = NULL; } -bb_hash* TG_(get_bb_hash)(void) -{ - return &bbs; -} +bb_hash* TG_(get_bb_hash)(void) { return &bbs; } /* The hash stores BBs according to * - ELF object (is 0 for code in anonymous mapping) * - BB base as object file offset */ -static __inline__ -UInt 
bb_hash_idx(obj_node* obj, PtrdiffT offset, UInt size) +static __inline__ UInt bb_hash_idx(obj_node* obj, PtrdiffT offset, UInt size) { - return (((Addr)obj) + offset) % size; + return (((Addr)obj) + offset) % size; } /* double size of bb table */ -static -void resize_bb_table(void) +static void resize_bb_table(void) { - Int i, new_size, conflicts1 = 0, conflicts2 = 0; - BB **new_table, *curr, *next; - UInt new_idx; - - new_size = 2* bbs.size +3; - new_table = (BB**) TG_MALLOC("cl.bb.rbt.1", - new_size * sizeof(BB*)); - - for (i = 0; i < new_size; i++) + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BB **new_table, *curr, *next; + UInt new_idx; + + new_size = 2 * bbs.size + 3; + new_table = (BB**)TG_MALLOC("cl.bb.rbt.1", new_size * sizeof(BB*)); + + for (i = 0; i < new_size; i++) new_table[i] = NULL; - - for (i = 0; i < bbs.size; i++) { - if (bbs.table[i] == NULL) continue; - - curr = bbs.table[i]; - while (NULL != curr) { - next = curr->next; - - new_idx = bb_hash_idx(curr->obj, curr->offset, new_size); - - curr->next = new_table[new_idx]; - new_table[new_idx] = curr; - if (curr->next) { - conflicts1++; - if (curr->next->next) - conflicts2++; - } - - curr = next; - } - } - - VG_(free)(bbs.table); - - - TG_DEBUG(0, "Resize BB Hash: %u => %d (entries %u, conflicts %d/%d)\n", - bbs.size, new_size, - bbs.entries, conflicts1, conflicts2); - - bbs.size = new_size; - bbs.table = new_table; - TG_(stat).bb_hash_resizes++; -} + for (i = 0; i < bbs.size; i++) { + if (bbs.table[i] == NULL) + continue; + + curr = bbs.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = bb_hash_idx(curr->obj, curr->offset, new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(bbs.table); + + TG_DEBUG(0, "Resize BB Hash: %u => %d (entries %u, conflicts %d/%d)\n", + bbs.size, new_size, bbs.entries, conflicts1, conflicts2); + + bbs.size = new_size; + bbs.table = new_table; + TG_(stat).bb_hash_resizes++; +} /** * Allocate new BB structure (including space for event type list) * Not initialized: * - instr_len, cost_count, instr[] */ -static BB* new_bb(obj_node* obj, PtrdiffT offset, - UInt instr_count, UInt cjmp_count, Bool cjmp_inverted) +static BB* new_bb(obj_node* obj, + PtrdiffT offset, + UInt instr_count, + UInt cjmp_count, + Bool cjmp_inverted) { - BB* bb; + BB* bb; UInt idx, size; /* check fill degree of bb hash table and resize if needed (>80%) */ bbs.entries++; if (10 * bbs.entries / bbs.size > 8) - resize_bb_table(); + resize_bb_table(); - size = sizeof(BB) + instr_count * sizeof(InstrInfo) - + (cjmp_count+1) * sizeof(CJmpInfo); - bb = (BB*) TG_MALLOC("cl.bb.nb.1", size); + size = sizeof(BB) + instr_count * sizeof(InstrInfo) + + (cjmp_count + 1) * sizeof(CJmpInfo); + bb = (BB*)TG_MALLOC("cl.bb.nb.1", size); VG_(memset)(bb, 0, size); - bb->obj = obj; - bb->offset = offset; - - bb->instr_count = instr_count; - bb->cjmp_count = cjmp_count; + bb->obj = obj; + bb->offset = offset; + + bb->instr_count = instr_count; + bb->cjmp_count = cjmp_count; bb->cjmp_inverted = cjmp_inverted; - bb->jmp = (CJmpInfo*) &(bb->instr[instr_count]); - bb->instr_len = 0; - bb->cost_count = 0; - bb->sect_kind = VG_(DebugInfo_sect_kind)(NULL, offset + obj->offset); - bb->fn = 0; - bb->line = 0; - bb->is_entry = 0; - bb->inl_fns = NULL; - bb->inl_depth = 0; - bb->bbcc_list = 0; - bb->last_bbcc = 0; + bb->jmp = (CJmpInfo*)&(bb->instr[instr_count]); + bb->instr_len = 0; + 
bb->cost_count = 0; + bb->sect_kind = VG_(DebugInfo_sect_kind)(NULL, offset + obj->offset); + bb->fn = 0; + bb->line = 0; + bb->is_entry = 0; + bb->inl_fns = NULL; + bb->inl_depth = 0; + bb->bbcc_list = 0; + bb->last_bbcc = 0; /* insert into BB hash table */ - idx = bb_hash_idx(obj, offset, bbs.size); - bb->next = bbs.table[idx]; + idx = bb_hash_idx(obj, offset, bbs.size); + bb->next = bbs.table[idx]; bbs.table[idx] = bb; TG_(stat).distinct_bbs++; #if TG_ENABLE_DEBUG - TG_DEBUGIF(3) { - VG_(printf)(" new_bb (instr %u, jmps %u, inv %s) [now %d]: ", - instr_count, cjmp_count, - cjmp_inverted ? "yes":"no", - TG_(stat).distinct_bbs); + TG_DEBUGIF(3) + { + VG_(printf)( + " new_bb (instr %u, jmps %u, inv %s) [now %d]: ", instr_count, + cjmp_count, cjmp_inverted ? "yes" : "no", TG_(stat).distinct_bbs); TG_(print_bb)(0, bb); VG_(printf)("\n"); } @@ -171,56 +166,54 @@ static BB* new_bb(obj_node* obj, PtrdiffT offset, return bb; } - /* get the BB structure for a BB start address */ -static __inline__ -BB* lookup_bb(obj_node* obj, PtrdiffT offset) +static __inline__ BB* lookup_bb(obj_node* obj, PtrdiffT offset) { - BB* bb; - Int idx; + BB* bb; + Int idx; - idx = bb_hash_idx(obj, offset, bbs.size); - bb = bbs.table[idx]; + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; - while(bb) { - if ((bb->obj == obj) && (bb->offset == offset)) break; + while (bb) { + if ((bb->obj == obj) && (bb->offset == offset)) + break; bb = bb->next; - } + } - TG_DEBUG(5, " lookup_bb (Obj %s, off %#lx): %p\n", - obj->name, (UWord)offset, bb); - return bb; + TG_DEBUG(5, " lookup_bb (Obj %s, off %#lx): %p\n", obj->name, (UWord)offset, + bb); + return bb; } -static __inline__ -obj_node* obj_of_address(Addr addr) +static __inline__ obj_node* obj_of_address(Addr addr) { - obj_node* obj; - DebugInfo* di; - PtrdiffT offset; - - DiEpoch ep = VG_(current_DiEpoch)(); - di = VG_(find_DebugInfo)(ep, addr); - obj = TG_(get_obj_node)( di ); - - /* Update symbol offset in object if remapped */ - /* FIXME (or at least check this) 2008 Feb 19: 'offset' is - only correct for text symbols, not for data symbols */ - offset = di ? VG_(DebugInfo_get_text_bias)(di):0; - if (obj->offset != offset) { + obj_node* obj; + DebugInfo* di; + PtrdiffT offset; + + DiEpoch ep = VG_(current_DiEpoch)(); + di = VG_(find_DebugInfo)(ep, addr); + obj = TG_(get_obj_node)(di); + + /* Update symbol offset in object if remapped */ + /* FIXME (or at least check this) 2008 Feb 19: 'offset' is + only correct for text symbols, not for data symbols */ + offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + if (obj->offset != offset) { Addr start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; - TG_DEBUG(0, "Mapping changed for '%s': %#lx -> %#lx\n", - obj->name, obj->start, start); + TG_DEBUG(0, "Mapping changed for '%s': %#lx -> %#lx\n", obj->name, + obj->start, start); /* Size should be the same, and offset diff == start diff */ - TG_ASSERT( obj->size == (di ? VG_(DebugInfo_get_text_size)(di) : 0) ); - TG_ASSERT( obj->start - start == obj->offset - offset ); + TG_ASSERT(obj->size == (di ? VG_(DebugInfo_get_text_size)(di) : 0)); + TG_ASSERT(obj->start - start == obj->offset - offset); obj->offset = offset; - obj->start = start; - } + obj->start = start; + } - return obj; + return obj; } /* Get the BB structure for a BB start address. @@ -238,111 +231,111 @@ obj_node* obj_of_address(Addr addr) * bbIn==0 is possible for artificial BB without real code. * Such a BB is created when returning to an unknown function. 
*/ -BB* TG_(get_bb)(Addr addr, IRSB* bbIn, /*OUT*/ Bool *seen_before) +BB* TG_(get_bb)(Addr addr, IRSB* bbIn, /*OUT*/ Bool* seen_before) { - BB* bb; - obj_node* obj; - UInt n_instrs, n_jmps; - Bool cjmp_inverted = False; - - TG_DEBUG(5, "+ get_bb(BB %#lx)\n", addr); - - obj = obj_of_address(addr); - bb = lookup_bb(obj, addr - obj->offset); - - n_instrs = 0; - n_jmps = 0; - TG_(collectBlockInfo)(bbIn, &n_instrs, &n_jmps, &cjmp_inverted); - - *seen_before = bb ? True : False; - if (*seen_before) { - if (bb->instr_count != n_instrs) { - VG_(message)(Vg_DebugMsg, - "ERROR: BB Retranslation Mismatch at BB %#lx\n", addr); - VG_(message)(Vg_DebugMsg, - " new: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", - obj->name, (UWord)obj->offset, - addr - obj->offset, n_instrs); - VG_(message)(Vg_DebugMsg, - " old: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", - bb->obj->name, (UWord)bb->obj->offset, - (UWord)bb->offset, bb->instr_count); - TG_ASSERT(bb->instr_count == n_instrs ); - } - TG_ASSERT(bb->cjmp_count == n_jmps ); - TG_(stat).bb_retranslations++; - - TG_DEBUG(5, "- get_bb(BB %#lx): seen before.\n", addr); - return bb; - } - - bb = new_bb(obj, addr - obj->offset, n_instrs, n_jmps, cjmp_inverted); - - TG_DEBUG(5, "- get_bb(BB %#lx)\n", addr); - - return bb; + BB* bb; + obj_node* obj; + UInt n_instrs, n_jmps; + Bool cjmp_inverted = False; + + TG_DEBUG(5, "+ get_bb(BB %#lx)\n", addr); + + obj = obj_of_address(addr); + bb = lookup_bb(obj, addr - obj->offset); + + n_instrs = 0; + n_jmps = 0; + TG_(collectBlockInfo)(bbIn, &n_instrs, &n_jmps, &cjmp_inverted); + + *seen_before = bb ? True : False; + if (*seen_before) { + if (bb->instr_count != n_instrs) { + VG_(message)(Vg_DebugMsg, + "ERROR: BB Retranslation Mismatch at BB %#lx\n", addr); + VG_(message)( + Vg_DebugMsg, " new: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + obj->name, (UWord)obj->offset, addr - obj->offset, n_instrs); + VG_(message)(Vg_DebugMsg, + " old: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + bb->obj->name, (UWord)bb->obj->offset, (UWord)bb->offset, + bb->instr_count); + TG_ASSERT(bb->instr_count == n_instrs); + } + TG_ASSERT(bb->cjmp_count == n_jmps); + TG_(stat).bb_retranslations++; + + TG_DEBUG(5, "- get_bb(BB %#lx): seen before.\n", addr); + return bb; + } + + bb = new_bb(obj, addr - obj->offset, n_instrs, n_jmps, cjmp_inverted); + + TG_DEBUG(5, "- get_bb(BB %#lx)\n", addr); + + return bb; } /* Delete the BB info for the bb with unredirected entry-point address 'addr'. */ void TG_(delete_bb)(Addr addr) { - BB *bb, *bp; - Int idx, size; - - obj_node* obj = obj_of_address(addr); - PtrdiffT offset = addr - obj->offset; + BB *bb, *bp; + Int idx, size; - idx = bb_hash_idx(obj, offset, bbs.size); - bb = bbs.table[idx]; + obj_node* obj = obj_of_address(addr); + PtrdiffT offset = addr - obj->offset; - /* bb points at the current bb under consideration, and bp is the - one before. */ - bp = NULL; - while(bb) { - if ((bb->obj == obj) && (bb->offset == offset)) break; + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + /* bb points at the current bb under consideration, and bp is the + one before. */ + bp = NULL; + while (bb) { + if ((bb->obj == obj) && (bb->offset == offset)) + break; bp = bb; bb = bb->next; - } - - if (bb == NULL) { - TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): NOT FOUND\n", - obj->name, (UWord)offset); - - /* we didn't find it. - * this happens when tracegrinds instrumentation mode - * was off at BB translation time, ie. no BB was created. 
- */ - return; - } - - /* unlink it from hash table */ - - if (bp == NULL) { - /* we found the first one in the list. */ - tl_assert(bb == bbs.table[idx]); - bbs.table[idx] = bb->next; - } else { - tl_assert(bb != bbs.table[idx]); - bp->next = bb->next; - } - - TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): %p, BBCC head: %p\n", - obj->name, (UWord)offset, bb, bb->bbcc_list); - - if (bb->bbcc_list == 0) { - /* can be safely deleted */ - - if (bb->inl_fns) VG_(free)(bb->inl_fns); - - /* Fill the block up with junk and then free it, so we will - hopefully get a segfault if it is used again by mistake. */ - size = sizeof(BB) - + bb->instr_count * sizeof(InstrInfo) - + (bb->cjmp_count+1) * sizeof(CJmpInfo); - VG_(memset)( bb, 0xAA, size ); - TG_FREE(bb); - return; - } - TG_DEBUG(3, " delete_bb: BB in use, can not free!\n"); + } + + if (bb == NULL) { + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): NOT FOUND\n", obj->name, + (UWord)offset); + + /* we didn't find it. + * this happens when tracegrinds instrumentation mode + * was off at BB translation time, ie. no BB was created. + */ + return; + } + + /* unlink it from hash table */ + + if (bp == NULL) { + /* we found the first one in the list. */ + tl_assert(bb == bbs.table[idx]); + bbs.table[idx] = bb->next; + } else { + tl_assert(bb != bbs.table[idx]); + bp->next = bb->next; + } + + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): %p, BBCC head: %p\n", obj->name, + (UWord)offset, bb, bb->bbcc_list); + + if (bb->bbcc_list == 0) { + /* can be safely deleted */ + + if (bb->inl_fns) + VG_(free)(bb->inl_fns); + + /* Fill the block up with junk and then free it, so we will + hopefully get a segfault if it is used again by mistake. */ + size = sizeof(BB) + bb->instr_count * sizeof(InstrInfo) + + (bb->cjmp_count + 1) * sizeof(CJmpInfo); + VG_(memset)(bb, 0xAA, size); + TG_FREE(bb); + return; + } + TG_DEBUG(3, " delete_bb: BB in use, can not free!\n"); } diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c index 597f61e64..15143c621 100644 --- a/tracegrind/bbcc.c +++ b/tracegrind/bbcc.c @@ -24,8 +24,8 @@ The GNU General Public License is contained in the file COPYING. 
*/ -#include "global.h" #include "costs.h" +#include "global.h" #include "pub_tool_threadstate.h" @@ -33,7 +33,7 @@ /*--- BBCC operations ---*/ /*------------------------------------------------------------*/ -#define N_BBCC_INITIAL_ENTRIES 10437 +#define N_BBCC_INITIAL_ENTRIES 10437 /* BBCC table (key is BB/Context), per thread, resizable */ bbcc_hash current_bbccs; @@ -46,33 +46,31 @@ void TG_(init_bbcc_hash)(bbcc_hash* bbccs) bbccs->size = N_BBCC_INITIAL_ENTRIES; bbccs->entries = 0; - bbccs->table = (BBCC**) TG_MALLOC("cl.bbcc.ibh.1", - bbccs->size * sizeof(BBCC*)); + bbccs->table = + (BBCC**)TG_MALLOC("cl.bbcc.ibh.1", bbccs->size * sizeof(BBCC*)); - for (i = 0; i < bbccs->size; i++) bbccs->table[i] = NULL; + for (i = 0; i < bbccs->size; i++) + bbccs->table[i] = NULL; } void TG_(copy_current_bbcc_hash)(bbcc_hash* dst) { - TG_ASSERT(dst != 0); + TG_ASSERT(dst != 0); - dst->size = current_bbccs.size; - dst->entries = current_bbccs.entries; - dst->table = current_bbccs.table; + dst->size = current_bbccs.size; + dst->entries = current_bbccs.entries; + dst->table = current_bbccs.table; } -bbcc_hash* TG_(get_current_bbcc_hash)(void) -{ - return ¤t_bbccs; -} +bbcc_hash* TG_(get_current_bbcc_hash)(void) { return ¤t_bbccs; } void TG_(set_current_bbcc_hash)(bbcc_hash* h) { - TG_ASSERT(h != 0); + TG_ASSERT(h != 0); - current_bbccs.size = h->size; - current_bbccs.entries = h->entries; - current_bbccs.table = h->table; + current_bbccs.size = h->size; + current_bbccs.entries = h->entries; + current_bbccs.table = h->table; } /* All BBCCs for recursion level 0 are inserted into a @@ -85,120 +83,110 @@ void TG_(set_current_bbcc_hash)(bbcc_hash* h) * counters to be changed in the execution of a BB. */ -static __inline__ -UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size) +static __inline__ UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size) { TG_ASSERT(bb != 0); TG_ASSERT(cxt != 0); return ((Addr)bb + (Addr)cxt) % size; } - /* Lookup for a BBCC in hash. - */ -static -BBCC* lookup_bbcc(BB* bb, Context* cxt) + */ +static BBCC* lookup_bbcc(BB* bb, Context* cxt) { BBCC* bbcc = bb->last_bbcc; UInt idx; /* check LRU */ if (bbcc->cxt == cxt) { - if (!TG_(clo).separate_threads) { - /* if we don't dump threads separate, tid doesn't have to match */ - return bbcc; - } - if (bbcc->tid == TG_(current_tid)) return bbcc; + if (!TG_(clo).separate_threads) { + /* if we don't dump threads separate, tid doesn't have to match */ + return bbcc; + } + if (bbcc->tid == TG_(current_tid)) + return bbcc; } TG_(stat).bbcc_lru_misses++; - idx = bbcc_hash_idx(bb, cxt, current_bbccs.size); + idx = bbcc_hash_idx(bb, cxt, current_bbccs.size); bbcc = current_bbccs.table[idx]; - while (bbcc && - (bb != bbcc->bb || - cxt != bbcc->cxt)) { - bbcc = bbcc->next; + while (bbcc && (bb != bbcc->bb || cxt != bbcc->cxt)) { + bbcc = bbcc->next; } - - TG_DEBUG(2," lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n", - bb_addr(bb), cxt->base_number, cxt->fn[0]->name, - bbcc, bbcc ? bbcc->tid : 0); + + TG_DEBUG(2, " lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n", + bb_addr(bb), cxt->base_number, cxt->fn[0]->name, bbcc, + bbcc ? 
bbcc->tid : 0); TG_DEBUGIF(2) - if (bbcc) TG_(print_bbcc)(-2,bbcc); + if (bbcc) + TG_(print_bbcc)(-2, bbcc); return bbcc; } - /* double size of hash table 1 (addr->BBCC) */ static void resize_bbcc_hash(void) { - Int i, new_size, conflicts1 = 0, conflicts2 = 0; - BBCC** new_table; - UInt new_idx; - BBCC *curr_BBCC, *next_BBCC; - - new_size = 2*current_bbccs.size+3; - new_table = (BBCC**) TG_MALLOC("cl.bbcc.rbh.1", - new_size * sizeof(BBCC*)); - - for (i = 0; i < new_size; i++) + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BBCC** new_table; + UInt new_idx; + BBCC * curr_BBCC, *next_BBCC; + + new_size = 2 * current_bbccs.size + 3; + new_table = (BBCC**)TG_MALLOC("cl.bbcc.rbh.1", new_size * sizeof(BBCC*)); + + for (i = 0; i < new_size; i++) new_table[i] = NULL; - - for (i = 0; i < current_bbccs.size; i++) { - if (current_bbccs.table[i] == NULL) continue; - - curr_BBCC = current_bbccs.table[i]; - while (NULL != curr_BBCC) { - next_BBCC = curr_BBCC->next; - - new_idx = bbcc_hash_idx(curr_BBCC->bb, - curr_BBCC->cxt, - new_size); - - curr_BBCC->next = new_table[new_idx]; - new_table[new_idx] = curr_BBCC; - if (curr_BBCC->next) { - conflicts1++; - if (curr_BBCC->next->next) - conflicts2++; - } - - curr_BBCC = next_BBCC; - } - } - - VG_(free)(current_bbccs.table); - - - TG_DEBUG(0,"Resize BBCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", - current_bbccs.size, new_size, - current_bbccs.entries, conflicts1, conflicts2); - - current_bbccs.size = new_size; - current_bbccs.table = new_table; - TG_(stat).bbcc_hash_resizes++; -} + for (i = 0; i < current_bbccs.size; i++) { + if (current_bbccs.table[i] == NULL) + continue; + + curr_BBCC = current_bbccs.table[i]; + while (NULL != curr_BBCC) { + next_BBCC = curr_BBCC->next; + + new_idx = bbcc_hash_idx(curr_BBCC->bb, curr_BBCC->cxt, new_size); -static __inline -BBCC** new_recursion(int size) + curr_BBCC->next = new_table[new_idx]; + new_table[new_idx] = curr_BBCC; + if (curr_BBCC->next) { + conflicts1++; + if (curr_BBCC->next->next) + conflicts2++; + } + + curr_BBCC = next_BBCC; + } + } + + VG_(free)(current_bbccs.table); + + TG_DEBUG(0, "Resize BBCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_bbccs.size, new_size, current_bbccs.entries, conflicts1, + conflicts2); + + current_bbccs.size = new_size; + current_bbccs.table = new_table; + TG_(stat).bbcc_hash_resizes++; +} + +static __inline BBCC** new_recursion(int size) { - BBCC** bbccs; - int i; + BBCC** bbccs; + int i; - bbccs = (BBCC**) TG_MALLOC("cl.bbcc.nr.1", sizeof(BBCC*) * size); - for(i=0;icjmp_count+1) * sizeof(JmpData)); + bbcc = (BBCC*)TG_MALLOC("cl.bbcc.nb.1", sizeof(BBCC) + (bb->cjmp_count + 1) * + sizeof(JmpData)); bbcc->bb = bb; bbcc->tid = TG_(current_tid); bbcc->skipped = 0; - bbcc->cost = TG_(get_costarray)(bb->cost_count); - for(i=0;icost_count;i++) - bbcc->cost[i] = 0; - for(i=0; i<=bb->cjmp_count; i++) { - bbcc->jmp[i].ecounter = 0; - bbcc->jmp[i].jcc_list = 0; + bbcc->cost = TG_(get_costarray)(bb->cost_count); + for (i = 0; i < bb->cost_count; i++) + bbcc->cost[i] = 0; + for (i = 0; i <= bb->cjmp_count; i++) { + bbcc->jmp[i].ecounter = 0; + bbcc->jmp[i].jcc_list = 0; } bbcc->ecounter_sum = 0; /* Init pointer caches (LRU) */ bbcc->lru_next_bbcc = 0; bbcc->lru_from_jcc = 0; - bbcc->lru_to_jcc = 0; - + bbcc->lru_to_jcc = 0; + TG_(stat).distinct_bbccs++; - TG_DEBUG(3, " new_bbcc(BB %#lx): %p (now %d)\n", - bb_addr(bb), bbcc, TG_(stat).distinct_bbccs); + TG_DEBUG(3, " new_bbcc(BB %#lx): %p (now %d)\n", bb_addr(bb), bbcc, + TG_(stat).distinct_bbccs); return bbcc; } - /** 
* Inserts a new BBCC into hashes. * BBCC specific items must be set as this is used for the hash @@ -256,53 +241,51 @@ BBCC* new_bbcc(BB* bb) * Recursion level doesn't need to be set as this is not included * in the hash key: Only BBCCs with rec level 0 are in hashes. */ -static -void insert_bbcc_into_hash(BBCC* bbcc) +static void insert_bbcc_into_hash(BBCC* bbcc) { - UInt idx; - - TG_ASSERT(bbcc->cxt != 0); + UInt idx; - TG_DEBUG(3,"+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n", - bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name); + TG_ASSERT(bbcc->cxt != 0); - /* check fill degree of hash and resize if needed (>90%) */ - current_bbccs.entries++; - if (100 * current_bbccs.entries / current_bbccs.size > 90) - resize_bbcc_hash(); + TG_DEBUG(3, "+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n", bb_addr(bbcc->bb), + bbcc->cxt->fn[0]->name); - idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size); - bbcc->next = current_bbccs.table[idx]; - current_bbccs.table[idx] = bbcc; + /* check fill degree of hash and resize if needed (>90%) */ + current_bbccs.entries++; + if (100 * current_bbccs.entries / current_bbccs.size > 90) + resize_bbcc_hash(); - TG_DEBUG(3,"- insert_bbcc_into_hash: %u entries\n", - current_bbccs.entries); + idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size); + bbcc->next = current_bbccs.table[idx]; + current_bbccs.table[idx] = bbcc; + + TG_DEBUG(3, "- insert_bbcc_into_hash: %u entries\n", current_bbccs.entries); } /* String is returned in a dynamically allocated buffer. Caller is responsible for free'ing it. */ static HChar* mangled_cxt(const Context* cxt, Int rec_index) { - Int i, p; + Int i, p; - if (!cxt) return VG_(strdup)("cl.bbcc.mcxt", "(no context)"); + if (!cxt) + return VG_(strdup)("cl.bbcc.mcxt", "(no context)"); - /* Overestimate the number of bytes we need to hold the string. */ - SizeT need = 20; // rec_index + nul-terminator - for (i = 0; i < cxt->size; ++i) - need += VG_(strlen)(cxt->fn[i]->name) + 1; // 1 for leading ' + /* Overestimate the number of bytes we need to hold the string. */ + SizeT need = 20; // rec_index + nul-terminator + for (i = 0; i < cxt->size; ++i) + need += VG_(strlen)(cxt->fn[i]->name) + 1; // 1 for leading ' - HChar *mangled = TG_MALLOC("cl.bbcc.mcxt", need); - p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name); - if (rec_index >0) - p += VG_(sprintf)(mangled+p, "'%d", rec_index +1); - for(i=1;isize;i++) - p += VG_(sprintf)(mangled+p, "'%s", cxt->fn[i]->name); + HChar* mangled = TG_MALLOC("cl.bbcc.mcxt", need); + p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name); + if (rec_index > 0) + p += VG_(sprintf)(mangled + p, "'%d", rec_index + 1); + for (i = 1; i < cxt->size; i++) + p += VG_(sprintf)(mangled + p, "'%s", cxt->fn[i]->name); - return mangled; + return mangled; } - /* Create a new BBCC as a copy of an existing one, * but with costs set to 0 and jcc chains empty. 
* @@ -317,73 +300,67 @@ static HChar* mangled_cxt(const Context* cxt, Int rec_index) */ static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index) { - BBCC* bbcc; + BBCC* bbcc; - TG_DEBUG(3,"+ clone_bbcc(BB %#lx, rec %d, fn %s)\n", - bb_addr(orig->bb), rec_index, cxt->fn[0]->name); + TG_DEBUG(3, "+ clone_bbcc(BB %#lx, rec %d, fn %s)\n", bb_addr(orig->bb), + rec_index, cxt->fn[0]->name); - bbcc = new_bbcc(orig->bb); + bbcc = new_bbcc(orig->bb); - if (rec_index == 0) { + if (rec_index == 0) { /* hash insertion is only allowed if tid or cxt is different */ - TG_ASSERT((orig->tid != TG_(current_tid)) || - (orig->cxt != cxt)); + TG_ASSERT((orig->tid != TG_(current_tid)) || (orig->cxt != cxt)); - bbcc->rec_index = 0; - bbcc->cxt = cxt; - bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions); + bbcc->rec_index = 0; + bbcc->cxt = cxt; + bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions); bbcc->rec_array[0] = bbcc; insert_bbcc_into_hash(bbcc); - } - else { + } else { if (TG_(clo).separate_threads) - TG_ASSERT(orig->tid == TG_(current_tid)); + TG_ASSERT(orig->tid == TG_(current_tid)); TG_ASSERT(orig->cxt == cxt); TG_ASSERT(orig->rec_array); TG_ASSERT(cxt->fn[0]->separate_recursions > rec_index); - TG_ASSERT(orig->rec_array[rec_index] ==0); + TG_ASSERT(orig->rec_array[rec_index] == 0); /* new BBCC will only have differing recursion level */ - bbcc->rec_index = rec_index; - bbcc->cxt = cxt; - bbcc->rec_array = orig->rec_array; + bbcc->rec_index = rec_index; + bbcc->cxt = cxt; + bbcc->rec_array = orig->rec_array; bbcc->rec_array[rec_index] = bbcc; - } + } - /* update list of BBCCs for same BB */ - bbcc->next_bbcc = orig->bb->bbcc_list; - orig->bb->bbcc_list = bbcc; + /* update list of BBCCs for same BB */ + bbcc->next_bbcc = orig->bb->bbcc_list; + orig->bb->bbcc_list = bbcc; + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); - TG_DEBUGIF(3) - TG_(print_bbcc)(-2, bbcc); - - HChar *mangled_orig = mangled_cxt(orig->cxt, orig->rec_index); - HChar *mangled_bbcc = mangled_cxt(bbcc->cxt, bbcc->rec_index); - TG_DEBUG(2,"- clone_BBCC(%p, %d) for BB %#lx\n" - " orig %s\n" - " new %s\n", - orig, rec_index, bb_addr(orig->bb), - mangled_orig, - mangled_bbcc); - TG_FREE(mangled_orig); - TG_FREE(mangled_bbcc); - - TG_(stat).bbcc_clones++; - - return bbcc; -}; + HChar* mangled_orig = mangled_cxt(orig->cxt, orig->rec_index); + HChar* mangled_bbcc = mangled_cxt(bbcc->cxt, bbcc->rec_index); + TG_DEBUG(2, + "- clone_BBCC(%p, %d) for BB %#lx\n" + " orig %s\n" + " new %s\n", + orig, rec_index, bb_addr(orig->bb), mangled_orig, mangled_bbcc); + TG_FREE(mangled_orig); + TG_FREE(mangled_bbcc); + TG_(stat).bbcc_clones++; + return bbcc; +}; /* Get a pointer to the cost centre structure for given basic block * address. If created, the BBCC is inserted into the BBCC hash. * Also sets BB_seen_before by reference. 
* - */ + */ BBCC* TG_(get_bbcc)(BB* bb) { BBCC* bbcc; @@ -393,28 +370,26 @@ BBCC* TG_(get_bbcc)(BB* bb) bbcc = bb->bbcc_list; if (!bbcc) { - bbcc = new_bbcc(bb); + bbcc = new_bbcc(bb); - /* initialize BBCC */ - bbcc->cxt = 0; - bbcc->rec_array = 0; - bbcc->rec_index = 0; + /* initialize BBCC */ + bbcc->cxt = 0; + bbcc->rec_array = 0; + bbcc->rec_index = 0; - bbcc->next_bbcc = bb->bbcc_list; - bb->bbcc_list = bbcc; - bb->last_bbcc = bbcc; + bbcc->next_bbcc = bb->bbcc_list; + bb->bbcc_list = bbcc; + bb->last_bbcc = bbcc; - TG_DEBUGIF(3) - TG_(print_bbcc)(-2, bbcc); + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); } - TG_DEBUG(3, "- get_bbcc(BB %#lx): BBCC %p\n", - bb_addr(bb), bbcc); + TG_DEBUG(3, "- get_bbcc(BB %#lx): BBCC %p\n", bb_addr(bb), bbcc); return bbcc; } - /* Tracegrind manages its own call stack for each thread. * When leaving a function, a underflow can happen when * Tracegrind's tracing was switched on in the middle of @@ -429,72 +404,70 @@ BBCC* TG_(get_bbcc)(BB* bb) */ static void handleUnderflow(BB* bb) { - /* RET at top of call stack */ - BBCC* source_bbcc; - BB* source_bb; - Bool seen_before; - fn_node* caller; - int fn_number; - unsigned *pactive; - call_entry* call_entry_up; - - TG_DEBUG(1," Callstack underflow !\n"); - - /* we emulate an old call from the function we return to - * by using ( -1) */ - source_bb = TG_(get_bb)(bb_addr(bb)-1, 0, &seen_before); - source_bbcc = TG_(get_bbcc)(source_bb); - - /* seen_before can be true if RET from a signal handler */ - if (!seen_before) { - source_bbcc->ecounter_sum = TG_(current_state).collect ? 1 : 0; - } - else if (TG_(current_state).collect) - source_bbcc->ecounter_sum++; - - /* Force a new top context, will be set active by push_cxt() */ - TG_(current_fn_stack).top--; - TG_(current_state).cxt = 0; - caller = TG_(get_fn_node)(bb); - TG_(push_cxt)( caller ); - - if (!seen_before) { - /* set rec array for source BBCC: this is at rec level 1 */ - source_bbcc->rec_array = new_recursion(caller->separate_recursions); - source_bbcc->rec_array[0] = source_bbcc; - - TG_ASSERT(source_bbcc->cxt == 0); - source_bbcc->cxt = TG_(current_state).cxt; - insert_bbcc_into_hash(source_bbcc); - } - TG_ASSERT(TG_(current_state).bbcc); - - /* correct active counts */ - fn_number = TG_(current_state).bbcc->cxt->fn[0]->number; - pactive = TG_(get_fn_entry)(fn_number); - (*pactive)--; - - /* This assertion is not correct for reentrant - * signal handlers */ - /* TG_ASSERT(*pactive == 0); */ - - TG_(current_state).nonskipped = 0; /* we didn't skip this function */ - /* back to current context */ - TG_(push_cxt)( TG_(current_state).bbcc->cxt->fn[0] ); - TG_(push_call_stack)(source_bbcc, 0, TG_(current_state).bbcc, - (Addr)-1, False); - call_entry_up = - &(TG_(current_call_stack).entry[TG_(current_call_stack).sp -1]); - /* assume this call is lasting since last dump or - * for a signal handler since it's call */ - if (TG_(current_state).sig == 0) - TG_(copy_cost)( TG_(sets).full, call_entry_up->enter_cost, - TG_(get_current_thread)()->lastdump_cost ); - else - TG_(zero_cost)( TG_(sets).full, call_entry_up->enter_cost ); + /* RET at top of call stack */ + BBCC* source_bbcc; + BB* source_bb; + Bool seen_before; + fn_node* caller; + int fn_number; + unsigned* pactive; + call_entry* call_entry_up; + + TG_DEBUG(1, " Callstack underflow !\n"); + + /* we emulate an old call from the function we return to + * by using ( -1) */ + source_bb = TG_(get_bb)(bb_addr(bb) - 1, 0, &seen_before); + source_bbcc = TG_(get_bbcc)(source_bb); + + /* seen_before can be true if 
RET from a signal handler */ + if (!seen_before) { + source_bbcc->ecounter_sum = TG_(current_state).collect ? 1 : 0; + } else if (TG_(current_state).collect) + source_bbcc->ecounter_sum++; + + /* Force a new top context, will be set active by push_cxt() */ + TG_(current_fn_stack).top--; + TG_(current_state).cxt = 0; + caller = TG_(get_fn_node)(bb); + TG_(push_cxt)(caller); + + if (!seen_before) { + /* set rec array for source BBCC: this is at rec level 1 */ + source_bbcc->rec_array = new_recursion(caller->separate_recursions); + source_bbcc->rec_array[0] = source_bbcc; + + TG_ASSERT(source_bbcc->cxt == 0); + source_bbcc->cxt = TG_(current_state).cxt; + insert_bbcc_into_hash(source_bbcc); + } + TG_ASSERT(TG_(current_state).bbcc); + + /* correct active counts */ + fn_number = TG_(current_state).bbcc->cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + + /* This assertion is not correct for reentrant + * signal handlers */ + /* TG_ASSERT(*pactive == 0); */ + + TG_(current_state).nonskipped = 0; /* we didn't skip this function */ + /* back to current context */ + TG_(push_cxt)(TG_(current_state).bbcc->cxt->fn[0]); + TG_(push_call_stack) + (source_bbcc, 0, TG_(current_state).bbcc, (Addr)-1, False); + call_entry_up = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp - 1]); + /* assume this call is lasting since last dump or + * for a signal handler since it's call */ + if (TG_(current_state).sig == 0) + TG_(copy_cost) + (TG_(sets).full, call_entry_up->enter_cost, + TG_(get_current_thread)()->lastdump_cost); + else TG_(zero_cost)(TG_(sets).full, call_entry_up->enter_cost); } - /* * Helper function called at start of each instrumented BB to setup * pointer to costs for current thread/context/recursion level @@ -503,87 +476,86 @@ static void handleUnderflow(BB* bb) VG_REGPARM(1) void TG_(setup_bbcc)(BB* bb) { - BBCC *bbcc, *last_bbcc; - Bool call_emulation = False, delayed_push = False, skip = False; - Addr sp; - BB* last_bb; - ThreadId tid; - TgJumpKind jmpkind; - Bool isConditionalJump; - Int passed = 0, csp; - Bool ret_without_call = False; - Int popcount_on_return = 1; - - TG_DEBUG(3,"+ setup_bbcc(BB %#lx)\n", bb_addr(bb)); - - /* This is needed because thread switches can not reliable be tracked - * with callback TG_(run_thread) only: we have otherwise no way to get - * the thread ID after a signal handler returns. - * This could be removed again if that bug is fixed in Valgrind. - * This is in the hot path but hopefully not to costly. - */ - tid = VG_(get_running_tid)(); + BBCC * bbcc, *last_bbcc; + Bool call_emulation = False, delayed_push = False, skip = False; + Addr sp; + BB* last_bb; + ThreadId tid; + TgJumpKind jmpkind; + Bool isConditionalJump; + Int passed = 0, csp; + Bool ret_without_call = False; + Int popcount_on_return = 1; + + TG_DEBUG(3, "+ setup_bbcc(BB %#lx)\n", bb_addr(bb)); + + /* This is needed because thread switches can not reliable be tracked + * with callback TG_(run_thread) only: we have otherwise no way to get + * the thread ID after a signal handler returns. + * This could be removed again if that bug is fixed in Valgrind. + * This is in the hot path but hopefully not to costly. + */ + tid = VG_(get_running_tid)(); #if 1 - /* TG_(switch_thread) is a no-op when tid is equal to TG_(current_tid). - * As this is on the hot path, we only call TG_(switch_thread)(tid) - * if tid differs from the TG_(current_tid). 
- */ - if (UNLIKELY(tid != TG_(current_tid))) - TG_(switch_thread)(tid); + /* TG_(switch_thread) is a no-op when tid is equal to TG_(current_tid). + * As this is on the hot path, we only call TG_(switch_thread)(tid) + * if tid differs from the TG_(current_tid). + */ + if (UNLIKELY(tid != TG_(current_tid))) + TG_(switch_thread)(tid); #else - TG_ASSERT(VG_(get_running_tid)() == TG_(current_tid)); + TG_ASSERT(VG_(get_running_tid)() == TG_(current_tid)); #endif - sp = VG_(get_SP)(tid); - last_bbcc = TG_(current_state).bbcc; - last_bb = last_bbcc ? last_bbcc->bb : 0; + sp = VG_(get_SP)(tid); + last_bbcc = TG_(current_state).bbcc; + last_bb = last_bbcc ? last_bbcc->bb : 0; - if (last_bb) { + if (last_bb) { passed = TG_(current_state).jmps_passed; TG_ASSERT(passed <= last_bb->cjmp_count); - jmpkind = last_bb->jmp[passed].jmpkind; + jmpkind = last_bb->jmp[passed].jmpkind; isConditionalJump = (passed < last_bb->cjmp_count); if (TG_(current_state).collect) { - if (!TG_(current_state).nonskipped) { - last_bbcc->ecounter_sum++; - last_bbcc->jmp[passed].ecounter++; - if (!TG_(clo).simulate_cache) { - /* update Ir cost */ - UInt instr_count = last_bb->jmp[passed].instr+1; - TG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count; - } - } - else { - /* do not increment exe counter of BBs in skipped functions, as it - * would fool dumping code */ - if (!TG_(clo).simulate_cache) { - /* update Ir cost */ - UInt instr_count = last_bb->jmp[passed].instr+1; - TG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count; - TG_(current_state).nonskipped->skipped[ fullOffset(EG_IR) ] - += instr_count; - } - } + if (!TG_(current_state).nonskipped) { + last_bbcc->ecounter_sum++; + last_bbcc->jmp[passed].ecounter++; + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr + 1; + TG_(current_state).cost[fullOffset(EG_IR)] += instr_count; + } + } else { + /* do not increment exe counter of BBs in skipped functions, as it + * would fool dumping code */ + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr + 1; + TG_(current_state).cost[fullOffset(EG_IR)] += instr_count; + TG_(current_state).nonskipped->skipped[fullOffset(EG_IR)] += + instr_count; + } + } } - TG_DEBUGIF(4) { - TG_(print_execstate)(-2, &TG_(current_state) ); - TG_(print_bbcc_cost)(-2, last_bbcc); + TG_DEBUGIF(4) + { + TG_(print_execstate)(-2, &TG_(current_state)); + TG_(print_bbcc_cost)(-2, last_bbcc); } - } - else { - jmpkind = jk_None; + } else { + jmpkind = jk_None; isConditionalJump = False; - } + } - /* Manipulate JmpKind if needed, only using BB specific info */ + /* Manipulate JmpKind if needed, only using BB specific info */ - csp = TG_(current_call_stack).sp; + csp = TG_(current_call_stack).sp; - /* A return not matching the top call in our callstack is a jump */ - if ( (jmpkind == jk_Return) && (csp >0)) { - Int csp_up = csp-1; + /* A return not matching the top call in our callstack is a jump */ + if ((jmpkind == jk_Return) && (csp > 0)) { + Int csp_up = csp - 1; call_entry* top_ce = &(TG_(current_call_stack).entry[csp_up]); /* We have a real return if @@ -595,296 +567,298 @@ void TG_(setup_bbcc)(BB* bb) * The latter condition is needed because on PPC, SP can stay * the same over CALL=b(c)l / RET=b(c)lr boundaries */ - if (sp < top_ce->sp) popcount_on_return = 0; + if (sp < top_ce->sp) + popcount_on_return = 0; else if (top_ce->sp == sp) { - while(1) { - if (top_ce->ret_addr == bb_addr(bb)) break; - if (csp_up>0) { - csp_up--; - top_ce = 
&(TG_(current_call_stack).entry[csp_up]); - if (top_ce->sp == sp) { - popcount_on_return++; - continue; - } - } - popcount_on_return = 0; - break; - } + while (1) { + if (top_ce->ret_addr == bb_addr(bb)) + break; + if (csp_up > 0) { + csp_up--; + top_ce = &(TG_(current_call_stack).entry[csp_up]); + if (top_ce->sp == sp) { + popcount_on_return++; + continue; + } + } + popcount_on_return = 0; + break; + } } if (popcount_on_return == 0) { - jmpkind = jk_Jump; - ret_without_call = True; + jmpkind = jk_Jump; + ret_without_call = True; } - } - - /* Should this jump be converted to call or pop/call ? */ - if (( jmpkind != jk_Return) && - ( jmpkind != jk_Call) && last_bb) { - - /* We simulate a JMP/Cont to be a CALL if - * - jump is in another ELF object or section kind - * - jump is to first instruction of a function (tail recursion) - */ - if (ret_without_call || - /* This is for detection of optimized tail recursion. - * On PPC, this is only detected as call when going to another - * function. The problem is that on PPC it can go wrong - * more easily (no stack frame setup needed) - */ + } + + /* Should this jump be converted to call or pop/call ? */ + if ((jmpkind != jk_Return) && (jmpkind != jk_Call) && last_bb) { + + /* We simulate a JMP/Cont to be a CALL if + * - jump is in another ELF object or section kind + * - jump is to first instruction of a function (tail recursion) + */ + if (ret_without_call || + /* This is for detection of optimized tail recursion. + * On PPC, this is only detected as call when going to another + * function. The problem is that on PPC it can go wrong + * more easily (no stack frame setup needed) + */ #if defined(VGA_ppc32) - (bb->is_entry && (last_bb->fn != bb->fn)) || + (bb->is_entry && (last_bb->fn != bb->fn)) || #else - bb->is_entry || + bb->is_entry || #endif - (last_bb->sect_kind != bb->sect_kind) || - (last_bb->obj->number != bb->obj->number)) { - - TG_DEBUG(1," JMP: %s[%s] to %s[%s]%s!\n", - last_bb->fn->name, last_bb->obj->name, - bb->fn->name, bb->obj->name, - ret_without_call?" (RET w/o CALL)":""); - - if (TG_(get_fn_node)(last_bb)->pop_on_jump && (csp>0)) { - - call_entry* top_ce = &(TG_(current_call_stack).entry[csp-1]); - - if (top_ce->jcc) { - - TG_DEBUG(1," Pop on Jump!\n"); - - /* change source for delayed push */ - TG_(current_state).bbcc = top_ce->jcc->from; - sp = top_ce->sp; - passed = top_ce->jcc->jmp; - TG_(pop_call_stack)(); - } - else { - TG_ASSERT(TG_(current_state).nonskipped != 0); - } - } - - jmpkind = jk_Call; - call_emulation = True; - } - } - - if (jmpkind == jk_Call) { - fn_node* node = TG_(get_fn_node)(bb); - skip = node->skip; - } - - TG_DEBUGIF(1) { - if (isConditionalJump) - VG_(printf)("Cond-"); - switch(jmpkind) { - case jk_None: VG_(printf)("Fall-through"); break; - case jk_Jump: VG_(printf)("Jump"); break; - case jk_Call: VG_(printf)("Call"); break; - case jk_Return: VG_(printf)("Return"); break; - default: tl_assert(0); - } - VG_(printf)(" %08lx -> %08lx, SP %08lx\n", - last_bb ? bb_jmpaddr(last_bb) : 0, - bb_addr(bb), sp); - } - - /* Handle CALL/RET and update context to get correct BBCC */ - - if (jmpkind == jk_Return) { - - if ((csp == 0) || - ((TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) && - ( *(TG_(current_fn_stack).top-1)==0)) ) { - - /* On an empty call stack or at a signal separation marker, - * a RETURN generates an call stack underflow. 
- */ - handleUnderflow(bb); - TG_(pop_call_stack)(); - } - else { - TG_ASSERT(popcount_on_return >0); - TG_(unwind_call_stack)(sp, popcount_on_return); - } - } - else { - Int unwind_count = TG_(unwind_call_stack)(sp, 0); - if (unwind_count > 0) { - /* if unwinding was done, this actually is a return */ - jmpkind = jk_Return; - } - - if (jmpkind == jk_Call) { - delayed_push = True; - - csp = TG_(current_call_stack).sp; - if (call_emulation && csp>0) - sp = TG_(current_call_stack).entry[csp-1].sp; - - } - } - - /* Change new context if needed, taking delayed_push into account */ - if ((delayed_push && !skip) || (TG_(current_state).cxt == 0)) { - TG_(push_cxt)(TG_(get_fn_node)(bb)); - } - TG_ASSERT(TG_(current_fn_stack).top > TG_(current_fn_stack).bottom); - - /* If there is a fresh instrumented BBCC, assign current context */ - bbcc = TG_(get_bbcc)(bb); - if (bbcc->cxt == 0) { - TG_ASSERT(bbcc->rec_array == 0); - - bbcc->cxt = TG_(current_state).cxt; - bbcc->rec_array = - new_recursion((*TG_(current_fn_stack).top)->separate_recursions); - bbcc->rec_array[0] = bbcc; - - insert_bbcc_into_hash(bbcc); - } - else { - /* get BBCC with current context */ - - /* first check LRU of last bbcc executed */ - - if (last_bbcc) { - bbcc = last_bbcc->lru_next_bbcc; - if (bbcc && - ((bbcc->bb != bb) || - (bbcc->cxt != TG_(current_state).cxt))) - bbcc = 0; - } - else - bbcc = 0; - - if (!bbcc) - bbcc = lookup_bbcc(bb, TG_(current_state).cxt); - if (!bbcc) - bbcc = clone_bbcc(bb->bbcc_list, TG_(current_state).cxt, 0); - - bb->last_bbcc = bbcc; - } - - /* save for fast lookup */ - if (last_bbcc) - last_bbcc->lru_next_bbcc = bbcc; - - if ((*TG_(current_fn_stack).top)->separate_recursions >1) { - UInt level, idx; - fn_node* top = *(TG_(current_fn_stack).top); - - level = *TG_(get_fn_entry)(top->number); - - if (delayed_push && !skip) { - level++; - } - if (level> top->separate_recursions) - level = top->separate_recursions; - - if (level == 0) { - /* can only happen if instrumentation just was switched on */ - level = 1; - *TG_(get_fn_entry)(top->number) = 1; - } - - idx = level -1; - if (bbcc->rec_array[idx]) - bbcc = bbcc->rec_array[idx]; - else - bbcc = clone_bbcc(bbcc, TG_(current_state).cxt, idx); - - TG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc); - } - - if (delayed_push) { - if (!skip && TG_(current_state).nonskipped) { - /* a call from skipped to nonskipped */ - TG_(current_state).bbcc = TG_(current_state).nonskipped; - /* FIXME: take the real passed count from shadow stack */ - passed = TG_(current_state).bbcc->bb->cjmp_count; - } - TG_(push_call_stack)(TG_(current_state).bbcc, passed, - bbcc, sp, skip); - } - - if (TG_(clo).collect_jumps && (jmpkind == jk_Jump)) { - - /* Handle conditional jumps followed, i.e. trace arcs - * This uses JCC structures, too */ - - jCC* jcc = TG_(get_jcc)(last_bbcc, passed, bbcc); - TG_ASSERT(jcc != 0); - // Change from default, and check if already changed - if (jcc->jmpkind == jk_Call) - jcc->jmpkind = isConditionalJump ? jk_CondJump : jk_Jump; - else { - // FIXME: Why can this fail? 
- // TG_ASSERT(jcc->jmpkind == jmpkind); - } - - jcc->call_counter++; - if (isConditionalJump) - TG_(stat).jcnd_counter++; - else - TG_(stat).jump_counter++; - } - - TG_(current_state).bbcc = bbcc; - - /* Check for inline function transitions */ - if (TG_(current_state).collect) { + (last_bb->sect_kind != bb->sect_kind) || + (last_bb->obj->number != bb->obj->number)) { + + TG_DEBUG(1, " JMP: %s[%s] to %s[%s]%s!\n", last_bb->fn->name, + last_bb->obj->name, bb->fn->name, bb->obj->name, + ret_without_call ? " (RET w/o CALL)" : ""); + + if (TG_(get_fn_node)(last_bb)->pop_on_jump && (csp > 0)) { + + call_entry* top_ce = &(TG_(current_call_stack).entry[csp - 1]); + + if (top_ce->jcc) { + + TG_DEBUG(1, " Pop on Jump!\n"); + + /* change source for delayed push */ + TG_(current_state).bbcc = top_ce->jcc->from; + sp = top_ce->sp; + passed = top_ce->jcc->jmp; + TG_(pop_call_stack)(); + } else { + TG_ASSERT(TG_(current_state).nonskipped != 0); + } + } + + jmpkind = jk_Call; + call_emulation = True; + } + } + + if (jmpkind == jk_Call) { + fn_node* node = TG_(get_fn_node)(bb); + skip = node->skip; + } + + TG_DEBUGIF(1) + { + if (isConditionalJump) + VG_(printf)("Cond-"); + switch (jmpkind) { + case jk_None: + VG_(printf)("Fall-through"); + break; + case jk_Jump: + VG_(printf)("Jump"); + break; + case jk_Call: + VG_(printf)("Call"); + break; + case jk_Return: + VG_(printf)("Return"); + break; + default: + tl_assert(0); + } + VG_(printf)(" %08lx -> %08lx, SP %08lx\n", + last_bb ? bb_jmpaddr(last_bb) : 0, bb_addr(bb), sp); + } + + /* Handle CALL/RET and update context to get correct BBCC */ + + if (jmpkind == jk_Return) { + + if ((csp == 0) || + ((TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) && + (*(TG_(current_fn_stack).top - 1) == 0))) { + + /* On an empty call stack or at a signal separation marker, + * a RETURN generates an call stack underflow. 
+ */ + handleUnderflow(bb); + TG_(pop_call_stack)(); + } else { + TG_ASSERT(popcount_on_return > 0); + TG_(unwind_call_stack)(sp, popcount_on_return); + } + } else { + Int unwind_count = TG_(unwind_call_stack)(sp, 0); + if (unwind_count > 0) { + /* if unwinding was done, this actually is a return */ + jmpkind = jk_Return; + } + + if (jmpkind == jk_Call) { + delayed_push = True; + + csp = TG_(current_call_stack).sp; + if (call_emulation && csp > 0) + sp = TG_(current_call_stack).entry[csp - 1].sp; + } + } + + /* Change new context if needed, taking delayed_push into account */ + if ((delayed_push && !skip) || (TG_(current_state).cxt == 0)) { + TG_(push_cxt)(TG_(get_fn_node)(bb)); + } + TG_ASSERT(TG_(current_fn_stack).top > TG_(current_fn_stack).bottom); + + /* If there is a fresh instrumented BBCC, assign current context */ + bbcc = TG_(get_bbcc)(bb); + if (bbcc->cxt == 0) { + TG_ASSERT(bbcc->rec_array == 0); + + bbcc->cxt = TG_(current_state).cxt; + bbcc->rec_array = + new_recursion((*TG_(current_fn_stack).top)->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } else { + /* get BBCC with current context */ + + /* first check LRU of last bbcc executed */ + + if (last_bbcc) { + bbcc = last_bbcc->lru_next_bbcc; + if (bbcc && + ((bbcc->bb != bb) || (bbcc->cxt != TG_(current_state).cxt))) + bbcc = 0; + } else + bbcc = 0; + + if (!bbcc) + bbcc = lookup_bbcc(bb, TG_(current_state).cxt); + if (!bbcc) + bbcc = clone_bbcc(bb->bbcc_list, TG_(current_state).cxt, 0); + + bb->last_bbcc = bbcc; + } + + /* save for fast lookup */ + if (last_bbcc) + last_bbcc->lru_next_bbcc = bbcc; + + if ((*TG_(current_fn_stack).top)->separate_recursions > 1) { + UInt level, idx; + fn_node* top = *(TG_(current_fn_stack).top); + + level = *TG_(get_fn_entry)(top->number); + + if (delayed_push && !skip) { + level++; + } + if (level > top->separate_recursions) + level = top->separate_recursions; + + if (level == 0) { + /* can only happen if instrumentation just was switched on */ + level = 1; + *TG_(get_fn_entry)(top->number) = 1; + } + + idx = level - 1; + if (bbcc->rec_array[idx]) + bbcc = bbcc->rec_array[idx]; + else + bbcc = clone_bbcc(bbcc, TG_(current_state).cxt, idx); + + TG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc); + } + + if (delayed_push) { + if (!skip && TG_(current_state).nonskipped) { + /* a call from skipped to nonskipped */ + TG_(current_state).bbcc = TG_(current_state).nonskipped; + /* FIXME: take the real passed count from shadow stack */ + passed = TG_(current_state).bbcc->bb->cjmp_count; + } + TG_(push_call_stack)(TG_(current_state).bbcc, passed, bbcc, sp, skip); + } + + if (TG_(clo).collect_jumps && (jmpkind == jk_Jump)) { + + /* Handle conditional jumps followed, i.e. trace arcs + * This uses JCC structures, too */ + + jCC* jcc = TG_(get_jcc)(last_bbcc, passed, bbcc); + TG_ASSERT(jcc != 0); + // Change from default, and check if already changed + if (jcc->jmpkind == jk_Call) + jcc->jmpkind = isConditionalJump ? jk_CondJump : jk_Jump; + else { + // FIXME: Why can this fail? 
+ // TG_ASSERT(jcc->jmpkind == jmpkind); + } + + jcc->call_counter++; + if (isConditionalJump) + TG_(stat).jcnd_counter++; + else + TG_(stat).jump_counter++; + } + + TG_(current_state).bbcc = bbcc; + + /* Check for inline function transitions */ + if (TG_(current_state).collect) { thread_info* ti = TG_(get_current_thread)(); if (ti) { - UInt old_depth = ti->cur_inl_depth; - UInt new_depth = bb->inl_depth; - - /* Fast path: both empty (most BBs) */ - if (old_depth != 0 || new_depth != 0) { - /* Find longest common prefix */ - UInt common = 0; - UInt min_depth = old_depth < new_depth ? old_depth : new_depth; - while (common < min_depth && - ti->cur_inl_fns[common] == bb->inl_fns[common]) - common++; - - /* EXIT from deepest down to common level */ - for (Int i = (Int)old_depth - 1; i >= (Int)common; i--) - TG_(trace_emit_exit_inlined)(TG_(current_tid), bb, - ti->cur_inl_fns[i]); - - /* ENTER from common level up to new deepest */ - for (UInt i = common; i < new_depth; i++) - TG_(trace_emit_enter_inlined)(TG_(current_tid), bb, - bb->inl_fns[i]); - - /* Update thread state */ - for (UInt i = 0; i < new_depth; i++) - ti->cur_inl_fns[i] = bb->inl_fns[i]; - ti->cur_inl_depth = new_depth; - } + UInt old_depth = ti->cur_inl_depth; + UInt new_depth = bb->inl_depth; + + /* Fast path: both empty (most BBs) */ + if (old_depth != 0 || new_depth != 0) { + /* Find longest common prefix */ + UInt common = 0; + UInt min_depth = old_depth < new_depth ? old_depth : new_depth; + while (common < min_depth && + ti->cur_inl_fns[common] == bb->inl_fns[common]) + common++; + + /* EXIT from deepest down to common level */ + for (Int i = (Int)old_depth - 1; i >= (Int)common; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), bb, ti->cur_inl_fns[i]); + + /* ENTER from common level up to new deepest */ + for (UInt i = common; i < new_depth; i++) + TG_(trace_emit_enter_inlined) + (TG_(current_tid), bb, bb->inl_fns[i]); + + /* Update thread state */ + for (UInt i = 0; i < new_depth; i++) + ti->cur_inl_fns[i] = bb->inl_fns[i]; + ti->cur_inl_depth = new_depth; + } } - } - - /* Even though this will be set in instrumented code directly before - * side exits, it needs to be set to 0 here in case an exception - * happens in first instructions of the BB */ - TG_(current_state).jmps_passed = 0; - // needed for log_* handlers called in this BB - TG_(bb_base) = bb->obj->offset + bb->offset; - TG_(cost_base) = bbcc->cost; - - TG_DEBUGIF(1) { - VG_(printf)(" "); - TG_(print_bbcc_fn)(bbcc); - VG_(printf)("\n"); - } - - TG_DEBUG(3,"- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n", - bb_addr(bb), bbcc->cost, bb->cost_count, - bb->instr_count, bb->instr_len); - TG_DEBUGIF(3) - TG_(print_cxt)(-8, TG_(current_state).cxt, bbcc->rec_index); - TG_DEBUG(3,"\n"); - - TG_(stat).bb_executions++; + } + + /* Even though this will be set in instrumented code directly before + * side exits, it needs to be set to 0 here in case an exception + * happens in first instructions of the BB */ + TG_(current_state).jmps_passed = 0; + // needed for log_* handlers called in this BB + TG_(bb_base) = bb->obj->offset + bb->offset; + TG_(cost_base) = bbcc->cost; + + TG_DEBUGIF(1) + { + VG_(printf)(" "); + TG_(print_bbcc_fn)(bbcc); + VG_(printf)("\n"); + } + + TG_DEBUG(3, "- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n", + bb_addr(bb), bbcc->cost, bb->cost_count, bb->instr_count, + bb->instr_len); + TG_DEBUGIF(3) + TG_(print_cxt)(-8, TG_(current_state).cxt, bbcc->rec_index); + TG_DEBUG(3, "\n"); + + TG_(stat).bb_executions++; } 
diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c index 1cc288dfc..1cf056a3f 100644 --- a/tracegrind/callstack.c +++ b/tracegrind/callstack.c @@ -33,7 +33,7 @@ /* Stack of current thread. Gets initialized when switching to 1st thread. * * The artificial call stack is an array of call_entry's, representing - * stack frames of the executing program. + * stack frames of the executing program. * Array call_stack and call_stack_esp have same size and grow on demand. * Array call_stack_esp holds SPs of corresponding stack frames. * @@ -45,121 +45,114 @@ call_stack TG_(current_call_stack); void TG_(init_call_stack)(call_stack* s) { - Int i; + Int i; - TG_ASSERT(s != 0); + TG_ASSERT(s != 0); - s->size = N_CALL_STACK_INITIAL_ENTRIES; - s->entry = (call_entry*) TG_MALLOC("cl.callstack.ics.1", - s->size * sizeof(call_entry)); - s->sp = 0; - s->entry[0].cxt = 0; /* for assertion in push_cxt() */ + s->size = N_CALL_STACK_INITIAL_ENTRIES; + s->entry = (call_entry*)TG_MALLOC("cl.callstack.ics.1", + s->size * sizeof(call_entry)); + s->sp = 0; + s->entry[0].cxt = 0; /* for assertion in push_cxt() */ - for(i=0; isize; i++) s->entry[i].enter_cost = 0; + for (i = 0; i < s->size; i++) + s->entry[i].enter_cost = 0; } call_entry* TG_(get_call_entry)(Int sp) { - TG_ASSERT(sp <= TG_(current_call_stack).sp); - return &(TG_(current_call_stack).entry[sp]); + TG_ASSERT(sp <= TG_(current_call_stack).sp); + return &(TG_(current_call_stack).entry[sp]); } void TG_(copy_current_call_stack)(call_stack* dst) { - TG_ASSERT(dst != 0); + TG_ASSERT(dst != 0); - dst->size = TG_(current_call_stack).size; - dst->entry = TG_(current_call_stack).entry; - dst->sp = TG_(current_call_stack).sp; + dst->size = TG_(current_call_stack).size; + dst->entry = TG_(current_call_stack).entry; + dst->sp = TG_(current_call_stack).sp; } void TG_(set_current_call_stack)(call_stack* s) { - TG_ASSERT(s != 0); + TG_ASSERT(s != 0); - TG_(current_call_stack).size = s->size; - TG_(current_call_stack).entry = s->entry; - TG_(current_call_stack).sp = s->sp; + TG_(current_call_stack).size = s->size; + TG_(current_call_stack).entry = s->entry; + TG_(current_call_stack).sp = s->sp; } - -static __inline__ -void ensure_stack_size(Int i) +static __inline__ void ensure_stack_size(Int i) { - Int oldsize; - call_stack *cs = &TG_(current_call_stack); - - if (i < cs->size) return; + Int oldsize; + call_stack* cs = &TG_(current_call_stack); - oldsize = cs->size; - cs->size *= 2; - while (i > cs->size) cs->size *= 2; + if (i < cs->size) + return; - cs->entry = (call_entry*) VG_(realloc)("cl.callstack.ess.1", - cs->entry, - cs->size * sizeof(call_entry)); + oldsize = cs->size; + cs->size *= 2; + while (i > cs->size) + cs->size *= 2; - for(i=oldsize; isize; i++) - cs->entry[i].enter_cost = 0; + cs->entry = (call_entry*)VG_(realloc)("cl.callstack.ess.1", cs->entry, + cs->size * sizeof(call_entry)); - TG_(stat).call_stack_resizes++; - - TG_DEBUGIF(2) - VG_(printf)(" call stack enlarged to %u entries\n", - TG_(current_call_stack).size); -} + for (i = oldsize; i < cs->size; i++) + cs->entry[i].enter_cost = 0; + TG_(stat).call_stack_resizes++; + TG_DEBUGIF(2) + VG_(printf)(" call stack enlarged to %u entries\n", + TG_(current_call_stack).size); +} /* Called when function entered nonrecursive */ static void function_entered(fn_node* fn) { - TG_ASSERT(fn != 0); + TG_ASSERT(fn != 0); #if TG_ENABLE_DEBUG - if (fn->verbosity >=0) { - Int old = TG_(clo).verbose; - TG_(clo).verbose = fn->verbosity; - fn->verbosity = old; - VG_(message)(Vg_DebugMsg, - "Entering %s: Verbosity 
set to %d\n", - fn->name, TG_(clo).verbose); - } -#endif - - if (fn->toggle_collect) { - TG_(current_state).collect = !TG_(current_state).collect; - TG_DEBUG(2," entering %s: toggled collection state to %s\n", - fn->name, - TG_(current_state).collect ? "ON" : "OFF"); - } -} + if (fn->verbosity >= 0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, "Entering %s: Verbosity set to %d\n", fn->name, + TG_(clo).verbose); + } +#endif + + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, " entering %s: toggled collection state to %s\n", fn->name, + TG_(current_state).collect ? "ON" : "OFF"); + } +} /* Called when function left (no recursive level active) */ static void function_left(fn_node* fn) { - TG_ASSERT(fn != 0); + TG_ASSERT(fn != 0); - if (fn->toggle_collect) { - TG_(current_state).collect = !TG_(current_state).collect; - TG_DEBUG(2," leaving %s: toggled collection state to %s\n", - fn->name, - TG_(current_state).collect ? "ON" : "OFF"); - } + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, " leaving %s: toggled collection state to %s\n", fn->name, + TG_(current_state).collect ? "ON" : "OFF"); + } #if TG_ENABLE_DEBUG - if (fn->verbosity >=0) { - Int old = TG_(clo).verbose; - TG_(clo).verbose = fn->verbosity; - fn->verbosity = old; - VG_(message)(Vg_DebugMsg, - "Leaving %s: Verbosity set back to %d\n", - fn->name, TG_(clo).verbose); - } -#endif + if (fn->verbosity >= 0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, "Leaving %s: Verbosity set back to %d\n", + fn->name, TG_(clo).verbose); + } +#endif } - /* Push call on call stack. * * Increment the usage count for the function called. @@ -169,138 +162,132 @@ static void function_left(fn_node* fn) */ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) { - jCC* jcc; - UInt* pdepth; - call_entry* current_entry; - Addr ret_addr; - - /* Ensure a call stack of size +1. - * The +1 is needed as push_cxt will store the - * context at [current_sp] - */ - ensure_stack_size(TG_(current_call_stack).sp +1); - current_entry = &(TG_(current_call_stack).entry[TG_(current_call_stack).sp]); - - if (skip) { - jcc = 0; - } - else { - fn_node* to_fn = to->cxt->fn[0]; - - if (TG_(current_state).nonskipped) { - /* this is a jmp from skipped to nonskipped */ - TG_ASSERT(TG_(current_state).nonskipped == from); - } - - /* As push_cxt() has to be called before push_call_stack if not - * skipping, the old context should already be saved on the stack */ - TG_ASSERT(current_entry->cxt != 0); - TG_(copy_cost_lz)( TG_(sets).full, &(current_entry->enter_cost), - TG_(current_state).cost ); - - jcc = TG_(get_jcc)(from, jmp, to); - TG_ASSERT(jcc != 0); - - pdepth = TG_(get_fn_entry)(to_fn->number); - (*pdepth)++; - - if (*pdepth>1) - TG_(stat).rec_call_counter++; - - jcc->call_counter++; - TG_(stat).call_counter++; - - if (*pdepth == 1) function_entered(to_fn); - } - - /* return address is only is useful with a real call; - * used to detect RET w/o CALL */ - if (from->bb->jmp[jmp].jmpkind == jk_Call) { + jCC* jcc; + UInt* pdepth; + call_entry* current_entry; + Addr ret_addr; + + /* Ensure a call stack of size +1. 
+ * The +1 is needed as push_cxt will store the + * context at [current_sp] + */ + ensure_stack_size(TG_(current_call_stack).sp + 1); + current_entry = &(TG_(current_call_stack).entry[TG_(current_call_stack).sp]); + + if (skip) { + jcc = 0; + } else { + fn_node* to_fn = to->cxt->fn[0]; + + if (TG_(current_state).nonskipped) { + /* this is a jmp from skipped to nonskipped */ + TG_ASSERT(TG_(current_state).nonskipped == from); + } + + /* As push_cxt() has to be called before push_call_stack if not + * skipping, the old context should already be saved on the stack */ + TG_ASSERT(current_entry->cxt != 0); + TG_(copy_cost_lz) + (TG_(sets).full, &(current_entry->enter_cost), TG_(current_state).cost); + + jcc = TG_(get_jcc)(from, jmp, to); + TG_ASSERT(jcc != 0); + + pdepth = TG_(get_fn_entry)(to_fn->number); + (*pdepth)++; + + if (*pdepth > 1) + TG_(stat).rec_call_counter++; + + jcc->call_counter++; + TG_(stat).call_counter++; + + if (*pdepth == 1) + function_entered(to_fn); + } + + /* return address is only is useful with a real call; + * used to detect RET w/o CALL */ + if (from->bb->jmp[jmp].jmpkind == jk_Call) { UInt instr = from->bb->jmp[jmp].instr; - ret_addr = bb_addr(from->bb) + - from->bb->instr[instr].instr_offset + - from->bb->instr[instr].instr_size; - } - else + ret_addr = bb_addr(from->bb) + from->bb->instr[instr].instr_offset + + from->bb->instr[instr].instr_size; + } else ret_addr = 0; - /* put jcc on call stack */ - current_entry->jcc = jcc; - current_entry->sp = sp; - current_entry->ret_addr = ret_addr; - current_entry->nonskipped = TG_(current_state).nonskipped; - - TG_(current_call_stack).sp++; - - /* Emit trace sample on function entry */ - if (!skip && TG_(current_state).collect) { - /* Exit entire inline stack, deepest first */ - thread_info* ti = TG_(get_current_thread)(); - if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { - for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) - TG_(trace_emit_exit_inlined)(TG_(current_tid), - TG_(current_state).bbcc->bb, - ti->cur_inl_fns[i]); - ti->cur_inl_depth = 0; - } - fn_node* to_fn = to->cxt->fn[0]; - TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); - } - - /* To allow for above assertion we set context of next frame to 0 */ - TG_ASSERT(TG_(current_call_stack).sp < TG_(current_call_stack).size); - current_entry++; - current_entry->cxt = 0; - - if (!skip) - TG_(current_state).nonskipped = 0; - else if (!TG_(current_state).nonskipped) { - /* a call from nonskipped to skipped */ - TG_(current_state).nonskipped = from; - if (!TG_(current_state).nonskipped->skipped) { - TG_(init_cost_lz)( TG_(sets).full, - &TG_(current_state).nonskipped->skipped); - TG_(stat).distinct_skips++; - } - } + /* put jcc on call stack */ + current_entry->jcc = jcc; + current_entry->sp = sp; + current_entry->ret_addr = ret_addr; + current_entry->nonskipped = TG_(current_state).nonskipped; + + TG_(current_call_stack).sp++; + + /* Emit trace sample on function entry */ + if (!skip && TG_(current_state).collect) { + /* Exit entire inline stack, deepest first */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), TG_(current_state).bbcc->bb, ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; + } + fn_node* to_fn = to->cxt->fn[0]; + TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); + } + + /* To allow for above assertion we set context of next frame to 0 */ + 
TG_ASSERT(TG_(current_call_stack).sp < TG_(current_call_stack).size); + current_entry++; + current_entry->cxt = 0; + + if (!skip) + TG_(current_state).nonskipped = 0; + else if (!TG_(current_state).nonskipped) { + /* a call from nonskipped to skipped */ + TG_(current_state).nonskipped = from; + if (!TG_(current_state).nonskipped->skipped) { + TG_(init_cost_lz) + (TG_(sets).full, &TG_(current_state).nonskipped->skipped); + TG_(stat).distinct_skips++; + } + } #if TG_ENABLE_DEBUG - TG_DEBUGIF(0) { - if (TG_(clo).verbose<2) { - if (jcc && jcc->to && jcc->to->bb) { - const HChar spaces[][41] = { - " . . . . . . . . . .", - " . . . . . . . . . . ", - " . . . . . . . . . . ", - ". . . . . . . . . . " }; - - int s = TG_(current_call_stack).sp; - UInt* pars = (UInt*) sp; - - BB* bb = jcc->to->bb; - if (s>40) s=40; - VG_(printf)("%s> %s(0x%x, 0x%x, ...) [%s / %#lx]\n", spaces[s%4]+40-s, bb->fn->name, - pars ? pars[1]:0, - pars ? pars[2]:0, - bb->obj->name + bb->obj->last_slash_pos, - (UWord)bb->offset); - } - } - else if (TG_(clo).verbose<4) { - VG_(printf)("+ %2d ", TG_(current_call_stack).sp); - TG_(print_short_jcc)(jcc); - VG_(printf)(", SP %#lx, RA %#lx\n", sp, ret_addr); - } - else { - VG_(printf)(" Pushed "); - TG_(print_stackentry)(3, TG_(current_call_stack).sp-1); - } - } + TG_DEBUGIF(0) + { + if (TG_(clo).verbose < 2) { + if (jcc && jcc->to && jcc->to->bb) { + const HChar spaces[][41] = { + " . . . . . . . . . .", + " . . . . . . . . . . ", + " . . . . . . . . . . ", + ". . . . . . . . . . "}; + + int s = TG_(current_call_stack).sp; + UInt* pars = (UInt*)sp; + + BB* bb = jcc->to->bb; + if (s > 40) + s = 40; + VG_(printf)( + "%s> %s(0x%x, 0x%x, ...) [%s / %#lx]\n", spaces[s % 4] + 40 - s, + bb->fn->name, pars ? pars[1] : 0, pars ? pars[2] : 0, + bb->obj->name + bb->obj->last_slash_pos, (UWord)bb->offset); + } + } else if (TG_(clo).verbose < 4) { + VG_(printf)("+ %2d ", TG_(current_call_stack).sp); + TG_(print_short_jcc)(jcc); + VG_(printf)(", SP %#lx, RA %#lx\n", sp, ret_addr); + } else { + VG_(printf)(" Pushed "); + TG_(print_stackentry)(3, TG_(current_call_stack).sp - 1); + } + } #endif - } - /* Pop call stack and update inclusive sums. * Returns modified fcc. 
* @@ -308,131 +295,126 @@ void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) */ void TG_(pop_call_stack)(void) { - jCC* jcc; - Int depth = 0; - call_entry* lower_entry; - - if (TG_(current_state).sig >0) { - /* Check if we leave a signal handler; this can happen when - * calling longjmp() in the handler */ - TG_(run_post_signal_on_call_stack_bottom)(); - } - - lower_entry = - &(TG_(current_call_stack).entry[TG_(current_call_stack).sp-1]); - - TG_DEBUG(4,"+ pop_call_stack: frame %d, jcc %p\n", - TG_(current_call_stack).sp, lower_entry->jcc); - - /* jCC item not any more on real stack: pop */ - jcc = lower_entry->jcc; - TG_(current_state).nonskipped = lower_entry->nonskipped; - - if (jcc) { - fn_node* to_fn = jcc->to->cxt->fn[0]; - UInt* pdepth = TG_(get_fn_entry)(to_fn->number); - (*pdepth)--; - depth = *pdepth; - - /* add cost difference to sum */ - if ( TG_(add_diff_cost_lz)( TG_(sets).full, &(jcc->cost), - lower_entry->enter_cost, - TG_(current_state).cost) ) { - - } - TG_(stat).ret_counter++; - - /* Emit trace sample on function exit */ - if (TG_(current_state).collect) { - /* Exit entire inline stack, deepest first */ - thread_info* ti = TG_(get_current_thread)(); - if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { - for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) - TG_(trace_emit_exit_inlined)(TG_(current_tid), - TG_(current_state).bbcc->bb, - ti->cur_inl_fns[i]); - ti->cur_inl_depth = 0; - } - TG_(trace_emit_sample)(TG_(current_tid), False, to_fn); - } - - /* restore context */ - TG_(current_state).cxt = lower_entry->cxt; - TG_(current_fn_stack).top = - TG_(current_fn_stack).bottom + lower_entry->fn_sp; - TG_ASSERT(TG_(current_state).cxt != 0); - - if (depth == 0) function_left(to_fn); - } - - /* To allow for an assertion in push_call_stack() */ - lower_entry->cxt = 0; - - TG_(current_call_stack).sp--; + jCC* jcc; + Int depth = 0; + call_entry* lower_entry; + + if (TG_(current_state).sig > 0) { + /* Check if we leave a signal handler; this can happen when + * calling longjmp() in the handler */ + TG_(run_post_signal_on_call_stack_bottom)(); + } + + lower_entry = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp - 1]); + + TG_DEBUG(4, "+ pop_call_stack: frame %d, jcc %p\n", + TG_(current_call_stack).sp, lower_entry->jcc); + + /* jCC item not any more on real stack: pop */ + jcc = lower_entry->jcc; + TG_(current_state).nonskipped = lower_entry->nonskipped; + + if (jcc) { + fn_node* to_fn = jcc->to->cxt->fn[0]; + UInt* pdepth = TG_(get_fn_entry)(to_fn->number); + (*pdepth)--; + depth = *pdepth; + + /* add cost difference to sum */ + if (TG_(add_diff_cost_lz)(TG_(sets).full, &(jcc->cost), + lower_entry->enter_cost, + TG_(current_state).cost)) { + } + TG_(stat).ret_counter++; + + /* Emit trace sample on function exit */ + if (TG_(current_state).collect) { + /* Exit entire inline stack, deepest first */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), TG_(current_state).bbcc->bb, ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; + } + TG_(trace_emit_sample)(TG_(current_tid), False, to_fn); + } + + /* restore context */ + TG_(current_state).cxt = lower_entry->cxt; + TG_(current_fn_stack).top = + TG_(current_fn_stack).bottom + lower_entry->fn_sp; + TG_ASSERT(TG_(current_state).cxt != 0); + + if (depth == 0) + function_left(to_fn); + } + + /* To allow for an assertion in 
push_call_stack() */ + lower_entry->cxt = 0; + + TG_(current_call_stack).sp--; #if TG_ENABLE_DEBUG - TG_DEBUGIF(1) { - if (TG_(clo).verbose<4) { - if (jcc) { - /* popped JCC target first */ - VG_(printf)("- %2d %#lx => ", - TG_(current_call_stack).sp, - bb_addr(jcc->to->bb)); - TG_(print_addr)(bb_jmpaddr(jcc->from->bb)); - VG_(printf)(", SP %#lx\n", - TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); - TG_(print_cost)(10, TG_(sets).full, jcc->cost); - } - else - VG_(printf)("- %2d [Skipped JCC], SP %#lx\n", - TG_(current_call_stack).sp, - TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); - } - else { - VG_(printf)(" Popped "); - TG_(print_stackentry)(7, TG_(current_call_stack).sp); - if (jcc) { - VG_(printf)(" returned to "); - TG_(print_addr_ln)(bb_jmpaddr(jcc->from->bb)); - } - } - } + TG_DEBUGIF(1) + { + if (TG_(clo).verbose < 4) { + if (jcc) { + /* popped JCC target first */ + VG_(printf)("- %2d %#lx => ", TG_(current_call_stack).sp, + bb_addr(jcc->to->bb)); + TG_(print_addr)(bb_jmpaddr(jcc->from->bb)); + VG_(printf)( + ", SP %#lx\n", + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + TG_(print_cost)(10, TG_(sets).full, jcc->cost); + } else + VG_(printf)( + "- %2d [Skipped JCC], SP %#lx\n", TG_(current_call_stack).sp, + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + } else { + VG_(printf)(" Popped "); + TG_(print_stackentry)(7, TG_(current_call_stack).sp); + if (jcc) { + VG_(printf)(" returned to "); + TG_(print_addr_ln)(bb_jmpaddr(jcc->from->bb)); + } + } + } #endif - } - /* Unwind enough CallStack items to sync with current stack pointer. * Returns the number of stack frames unwinded. */ Int TG_(unwind_call_stack)(Addr sp, Int minpops) { - Int csp; - Int unwind_count = 0; - TG_DEBUG(4,"+ unwind_call_stack(sp %#lx, minpops %d): frame %d\n", - sp, minpops, TG_(current_call_stack).sp); - - /* We pop old stack frames. - * For a call, be p the stack address with return address. - * - call_stack_esp[] has SP after the CALL: p-4 - * - current sp is after a RET: >= p - */ - - while( (csp=TG_(current_call_stack).sp) >0) { - call_entry* top_ce = &(TG_(current_call_stack).entry[csp-1]); - - if ((top_ce->sp < sp) || - ((top_ce->sp == sp) && minpops>0)) { - - minpops--; - unwind_count++; - TG_(pop_call_stack)(); - csp=TG_(current_call_stack).sp; - continue; - } - break; - } - - TG_DEBUG(4,"- unwind_call_stack\n"); - return unwind_count; + Int csp; + Int unwind_count = 0; + TG_DEBUG(4, "+ unwind_call_stack(sp %#lx, minpops %d): frame %d\n", sp, + minpops, TG_(current_call_stack).sp); + + /* We pop old stack frames. + * For a call, be p the stack address with return address. 
+ * - call_stack_esp[] has SP after the CALL: p-4 + * - current sp is after a RET: >= p + */ + + while ((csp = TG_(current_call_stack).sp) > 0) { + call_entry* top_ce = &(TG_(current_call_stack).entry[csp - 1]); + + if ((top_ce->sp < sp) || ((top_ce->sp == sp) && minpops > 0)) { + + minpops--; + unwind_count++; + TG_(pop_call_stack)(); + csp = TG_(current_call_stack).sp; + continue; + } + break; + } + + TG_DEBUG(4, "- unwind_call_stack\n"); + return unwind_count; } diff --git a/tracegrind/clo.c b/tracegrind/clo.c index 9662016fb..5bfa108fa 100644 --- a/tracegrind/clo.c +++ b/tracegrind/clo.c @@ -27,31 +27,29 @@ #include "global.h" - - /*------------------------------------------------------------*/ /*--- Function specific configuration options ---*/ /*------------------------------------------------------------*/ /* Special value for separate_callers: automatic = adaptive */ -#define CONFIG_AUTO -1 +#define CONFIG_AUTO -1 #define CONFIG_DEFAULT -1 -#define CONFIG_FALSE 0 -#define CONFIG_TRUE 1 +#define CONFIG_FALSE 0 +#define CONFIG_TRUE 1 /* Logging configuration for a function */ struct _fn_config { - Int toggle_collect; + Int toggle_collect; - Int group; /* don't change caller dependency inside group !=0 */ - Int pop_on_jump; + Int group; /* don't change caller dependency inside group !=0 */ + Int pop_on_jump; - Int separate_callers; /* separate logging dependent on caller */ - Int separate_recursions; /* separate logging of rec. levels */ + Int separate_callers; /* separate logging dependent on caller */ + Int separate_recursions; /* separate logging of rec. levels */ #if TG_ENABLE_DEBUG - Int verbosity; /* Change debug verbosity level while in function */ + Int verbosity; /* Change debug verbosity level while in function */ #endif }; @@ -69,76 +67,69 @@ struct _fn_config { * go down the tree and apply all non-default configurations. 
*/ - #define NODE_DEGREE 30 /* node of compressed trie search structure */ typedef struct _config_node config_node; struct _config_node { - Int length; - - fn_config* config; - config_node* sub_node[NODE_DEGREE]; - config_node* next; - config_node* wild_star; - config_node* wild_char; - - HChar name[1]; + Int length; + + fn_config* config; + config_node* sub_node[NODE_DEGREE]; + config_node* next; + config_node* wild_star; + config_node* wild_char; + + HChar name[1]; }; /* root of trie */ static config_node* fn_configs = 0; -static __inline__ -fn_config* new_fnc(void) +static __inline__ fn_config* new_fnc(void) { - fn_config* fnc = (fn_config*) TG_MALLOC("cl.clo.nf.1", - sizeof(fn_config)); + fn_config* fnc = (fn_config*)TG_MALLOC("cl.clo.nf.1", sizeof(fn_config)); - fnc->toggle_collect = CONFIG_DEFAULT; - fnc->pop_on_jump = CONFIG_DEFAULT; - fnc->group = CONFIG_DEFAULT; + fnc->toggle_collect = CONFIG_DEFAULT; + fnc->pop_on_jump = CONFIG_DEFAULT; + fnc->group = CONFIG_DEFAULT; fnc->separate_callers = CONFIG_DEFAULT; fnc->separate_recursions = CONFIG_DEFAULT; #if TG_ENABLE_DEBUG - fnc->verbosity = CONFIG_DEFAULT; + fnc->verbosity = CONFIG_DEFAULT; #endif return fnc; } - static config_node* new_config(const HChar* name, int length) { - int i; - config_node* node = (config_node*) TG_MALLOC("cl.clo.nc.1", - sizeof(config_node) + length); + int i; + config_node* node = + (config_node*)TG_MALLOC("cl.clo.nc.1", sizeof(config_node) + length); - for(i=0;iname[i] = name[i]; - } - node->name[i] = 0; + } + node->name[i] = 0; - node->length = length; - node->config = 0; - for(i=0;isub_node[i] = 0; - node->next = 0; - node->wild_char = 0; - node->wild_star = 0; + node->length = length; + node->config = 0; + for (i = 0; i < NODE_DEGREE; i++) + node->sub_node[i] = 0; + node->next = 0; + node->wild_char = 0; + node->wild_star = 0; - TG_DEBUG(3, " new_config('%s', len %d)\n", node->name, length); + TG_DEBUG(3, " new_config('%s', len %d)\n", node->name, length); - return node; + return node; } -static __inline__ -Bool is_wild(HChar n) -{ - return (n == '*') || (n == '?'); -} +static __inline__ Bool is_wild(HChar n) { return (n == '*') || (n == '?'); } /* Recursively build up function matching tree (prefix tree). 
* Returns function config object for pattern @@ -150,181 +141,193 @@ Bool is_wild(HChar n) */ static fn_config* get_fnc2(config_node* node, const HChar* name) { - config_node *new_sub, *n, *nprev; - int offset, len; - - TG_DEBUG(3, " get_fnc2(%p, '%s')\n", node, name); - - if (name[0] == 0) { - if (!node->config) node->config = new_fnc(); - return node->config; - } - - if (is_wild(*name)) { - if (*name == '*') { - while(name[1] == '*') name++; - new_sub = node->wild_star; - } - else - new_sub = node->wild_char; - - if (!new_sub) { - new_sub = new_config(name, 1); - if (*name == '*') - node->wild_star = new_sub; + config_node *new_sub, *n, *nprev; + int offset, len; + + TG_DEBUG(3, " get_fnc2(%p, '%s')\n", node, name); + + if (name[0] == 0) { + if (!node->config) + node->config = new_fnc(); + return node->config; + } + + if (is_wild(*name)) { + if (*name == '*') { + while (name[1] == '*') + name++; + new_sub = node->wild_star; + } else + new_sub = node->wild_char; + + if (!new_sub) { + new_sub = new_config(name, 1); + if (*name == '*') + node->wild_star = new_sub; + else + node->wild_char = new_sub; + } + + return get_fnc2(new_sub, name + 1); + } + + n = node->sub_node[name[0] % NODE_DEGREE]; + nprev = 0; + len = 0; + while (n) { + for (len = 0; name[len] == n->name[len]; len++) + ; + if (len > 0) + break; + nprev = n; + n = n->next; + } + + if (!n) { + len = 1; + while (name[len] && (!is_wild(name[len]))) + len++; + new_sub = new_config(name, len); + new_sub->next = node->sub_node[name[0] % NODE_DEGREE]; + node->sub_node[name[0] % NODE_DEGREE] = new_sub; + + if (name[len] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2(new_sub, name + len); + } + + if (len < n->length) { + + /* split up the subnode */ + config_node* new_node; + int i; + + new_node = new_config(n->name, len); + if (nprev) + nprev->next = new_node; else - node->wild_char = new_sub; - } - - return get_fnc2( new_sub, name+1); - } - - n = node->sub_node[ name[0]%NODE_DEGREE ]; - nprev = 0; - len = 0; - while(n) { - for(len=0; name[len] == n->name[len]; len++); - if (len>0) break; - nprev = n; - n = n->next; - } - - if (!n) { - len = 1; - while(name[len] && (!is_wild(name[len]))) len++; - new_sub = new_config(name, len); - new_sub->next = node->sub_node[ name[0]%NODE_DEGREE ]; - node->sub_node[ name[0]%NODE_DEGREE ] = new_sub; - - if (name[len] == 0) { - new_sub->config = new_fnc(); - return new_sub->config; - } - - /* recurse on wildcard */ - return get_fnc2( new_sub, name+len); - } - - if (len < n->length) { - - /* split up the subnode */ - config_node *new_node; - int i; - - new_node = new_config(n->name, len); - if (nprev) - nprev->next = new_node; - else - node->sub_node[ n->name[0]%NODE_DEGREE ] = new_node; - new_node->next = n->next; - - new_node->sub_node[ n->name[len]%NODE_DEGREE ] = n; - - for(i=0, offset=len; offset < n->length; i++, offset++) - n->name[i] = n->name[offset]; - n->name[i] = 0; - n->length = i; - - name += len; - offset = 0; - while(name[offset] && (!is_wild(name[offset]))) offset++; - new_sub = new_config(name, offset); - /* this sub_node of new_node could already be set: chain! 
*/ - new_sub->next = new_node->sub_node[ name[0]%NODE_DEGREE ]; - new_node->sub_node[ name[0]%NODE_DEGREE ] = new_sub; - - if (name[offset]==0) { - new_sub->config = new_fnc(); - return new_sub->config; - } - - /* recurse on wildcard */ - return get_fnc2( new_sub, name+offset); - } - - name += n->length; - - if (name[0] == 0) { - /* name and node name are the same */ - if (!n->config) n->config = new_fnc(); - return n->config; - } - - offset = 1; - while(name[offset] && (!is_wild(name[offset]))) offset++; - - new_sub = new_config(name, offset); - new_sub->next = n->sub_node[ name[0]%NODE_DEGREE ]; - n->sub_node[ name[0]%NODE_DEGREE ] = new_sub; - - return get_fnc2(new_sub, name+offset); + node->sub_node[n->name[0] % NODE_DEGREE] = new_node; + new_node->next = n->next; + + new_node->sub_node[n->name[len] % NODE_DEGREE] = n; + + for (i = 0, offset = len; offset < n->length; i++, offset++) + n->name[i] = n->name[offset]; + n->name[i] = 0; + n->length = i; + + name += len; + offset = 0; + while (name[offset] && (!is_wild(name[offset]))) + offset++; + new_sub = new_config(name, offset); + /* this sub_node of new_node could already be set: chain! */ + new_sub->next = new_node->sub_node[name[0] % NODE_DEGREE]; + new_node->sub_node[name[0] % NODE_DEGREE] = new_sub; + + if (name[offset] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2(new_sub, name + offset); + } + + name += n->length; + + if (name[0] == 0) { + /* name and node name are the same */ + if (!n->config) + n->config = new_fnc(); + return n->config; + } + + offset = 1; + while (name[offset] && (!is_wild(name[offset]))) + offset++; + + new_sub = new_config(name, offset); + new_sub->next = n->sub_node[name[0] % NODE_DEGREE]; + n->sub_node[name[0] % NODE_DEGREE] = new_sub; + + return get_fnc2(new_sub, name + offset); } static void print_config_node(int depth, int hash, config_node* node) { - config_node* n; - int i; - - if (node != fn_configs) { - const HChar sp[] = " "; - - if (depth>40) depth=40; - VG_(printf)("%s", sp+40-depth); - if (hash >=0) VG_(printf)(" [hash %2d]", hash); - else if (hash == -2) VG_(printf)(" [wildc ?]"); - else if (hash == -3) VG_(printf)(" [wildc *]"); - VG_(printf)(" '%s' (len %d)\n", node->name, node->length); - } - for(i=0;isub_node[i]; - while(n) { - print_config_node(depth+1, i, n); - n = n->next; - } - } - if (node->wild_char) print_config_node(depth+1, -2, node->wild_char); - if (node->wild_star) print_config_node(depth+1, -3, node->wild_star); + config_node* n; + int i; + + if (node != fn_configs) { + const HChar sp[] = " "; + + if (depth > 40) + depth = 40; + VG_(printf)("%s", sp + 40 - depth); + if (hash >= 0) + VG_(printf)(" [hash %2d]", hash); + else if (hash == -2) + VG_(printf)(" [wildc ?]"); + else if (hash == -3) + VG_(printf)(" [wildc *]"); + VG_(printf)(" '%s' (len %d)\n", node->name, node->length); + } + for (i = 0; i < NODE_DEGREE; i++) { + n = node->sub_node[i]; + while (n) { + print_config_node(depth + 1, i, n); + n = n->next; + } + } + if (node->wild_char) + print_config_node(depth + 1, -2, node->wild_char); + if (node->wild_star) + print_config_node(depth + 1, -3, node->wild_star); } /* get a function config for a name pattern (from command line) */ static fn_config* get_fnc(const HChar* name) { - fn_config* fnc; - - TG_DEBUG(3, " +get_fnc(%s)\n", name); - if (fn_configs == 0) - fn_configs = new_config(name, 0); - fnc = get_fnc2(fn_configs, name); - - TG_DEBUGIF(3) { - TG_DEBUG(3, " -get_fnc(%s):\n", name); - 
print_config_node(3, -1, fn_configs); - } - return fnc; -} + fn_config* fnc; + + TG_DEBUG(3, " +get_fnc(%s)\n", name); + if (fn_configs == 0) + fn_configs = new_config(name, 0); + fnc = get_fnc2(fn_configs, name); - + TG_DEBUGIF(3) + { + TG_DEBUG(3, " -get_fnc(%s):\n", name); + print_config_node(3, -1, fn_configs); + } + return fnc; +} static void update_fn_config1(fn_node* fn, fn_config* fnc) { - if (fnc->toggle_collect != CONFIG_DEFAULT) - fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE); + if (fnc->toggle_collect != CONFIG_DEFAULT) + fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE); - if (fnc->pop_on_jump != CONFIG_DEFAULT) - fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE); + if (fnc->pop_on_jump != CONFIG_DEFAULT) + fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE); - if (fnc->group != CONFIG_DEFAULT) - fn->group = fnc->group; + if (fnc->group != CONFIG_DEFAULT) + fn->group = fnc->group; - if (fnc->separate_callers != CONFIG_DEFAULT) - fn->separate_callers = fnc->separate_callers; + if (fnc->separate_callers != CONFIG_DEFAULT) + fn->separate_callers = fnc->separate_callers; - if (fnc->separate_recursions != CONFIG_DEFAULT) - fn->separate_recursions = fnc->separate_recursions; + if (fnc->separate_recursions != CONFIG_DEFAULT) + fn->separate_recursions = fnc->separate_recursions; #if TG_ENABLE_DEBUG - if (fnc->verbosity != CONFIG_DEFAULT) - fn->verbosity = fnc->verbosity; + if (fnc->verbosity != CONFIG_DEFAULT) + fn->verbosity = fnc->verbosity; #endif } @@ -332,54 +335,52 @@ static void update_fn_config1(fn_node* fn, fn_config* fnc) * looking for a match to . For every matching leaf, * is updated with the pattern config. */ -static void update_fn_config2(fn_node* fn, const HChar* name, - config_node* node) +static void update_fn_config2(fn_node* fn, const HChar* name, config_node* node) { - config_node* n; + config_node* n; - TG_DEBUG(3, " update_fn_config2('%s', node '%s'): \n", - name, node->name); - if ((*name == 0) && node->config) { + TG_DEBUG(3, " update_fn_config2('%s', node '%s'): \n", name, node->name); + if ((*name == 0) && node->config) { TG_DEBUG(3, " found!\n"); update_fn_config1(fn, node->config); return; - } + } - n = node->sub_node[ name[0]%NODE_DEGREE ]; - while(n) { - if (VG_(strncmp)(name, n->name, n->length)==0) break; + n = node->sub_node[name[0] % NODE_DEGREE]; + while (n) { + if (VG_(strncmp)(name, n->name, n->length) == 0) + break; n = n->next; - } - if (n) { - TG_DEBUG(3, " '%s' matching at hash %d\n", - n->name, name[0]%NODE_DEGREE); - update_fn_config2(fn, name+n->length, n); - } - - if (node->wild_char) { - TG_DEBUG(3, " skip '%c' for wildcard '?'\n", *name); - update_fn_config2(fn, name+1, node->wild_char); - } - - if (node->wild_star) { + } + if (n) { + TG_DEBUG(3, " '%s' matching at hash %d\n", n->name, + name[0] % NODE_DEGREE); + update_fn_config2(fn, name + n->length, n); + } + + if (node->wild_char) { + TG_DEBUG(3, " skip '%c' for wildcard '?'\n", *name); + update_fn_config2(fn, name + 1, node->wild_char); + } + + if (node->wild_star) { TG_DEBUG(3, " wildcard '*'\n"); - while(*name) { - update_fn_config2(fn, name, node->wild_star); - name++; + while (*name) { + update_fn_config2(fn, name, node->wild_star); + name++; } update_fn_config2(fn, name, node->wild_star); - } + } } /* Update function config according to configs of name prefixes */ void TG_(update_fn_config)(fn_node* fn) { - TG_DEBUG(3, " update_fn_config('%s')\n", fn->name); - if (fn_configs) + TG_DEBUG(3, " update_fn_config('%s')\n", fn->name); + if (fn_configs) 
update_fn_config2(fn, fn->name, fn_configs); } - /*--------------------------------------------------------------------*/ /*--- Command line processing ---*/ /*--------------------------------------------------------------------*/ @@ -388,114 +389,130 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) { const HChar* tmp_str; - if VG_BOOL_CLO(arg, "--skip-plt", TG_(clo).skip_plt) {} + if VG_BOOL_CLO (arg, "--skip-plt", TG_(clo).skip_plt) { + } - else if VG_BOOL_CLO(arg, "--collect-jumps", TG_(clo).collect_jumps) {} + else if VG_BOOL_CLO (arg, "--collect-jumps", TG_(clo).collect_jumps) { + } /* compatibility alias, deprecated option */ - else if VG_BOOL_CLO(arg, "--trace-jump", TG_(clo).collect_jumps) {} + else if VG_BOOL_CLO (arg, "--trace-jump", TG_(clo).collect_jumps) { + } - else if VG_BOOL_CLO(arg, "--collect-atstart", TG_(clo).collect_atstart) {} + else if VG_BOOL_CLO (arg, "--collect-atstart", TG_(clo).collect_atstart) { + } - else if VG_BOOL_CLO(arg, "--instr-atstart", TG_(clo).instrument_atstart) {} + else if VG_BOOL_CLO (arg, "--instr-atstart", TG_(clo).instrument_atstart) { + } - else if VG_BOOL_CLO(arg, "--separate-threads", TG_(clo).separate_threads) {} + else if VG_BOOL_CLO (arg, "--separate-threads", TG_(clo).separate_threads) { + } - else if VG_STR_CLO(arg, "--toggle-collect", tmp_str) { - fn_config* fnc = get_fnc(tmp_str); - fnc->toggle_collect = CONFIG_TRUE; - /* defaults to initial collection off */ - TG_(clo).collect_atstart = False; + else if VG_STR_CLO (arg, "--toggle-collect", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->toggle_collect = CONFIG_TRUE; + /* defaults to initial collection off */ + TG_(clo).collect_atstart = False; } - else if VG_INT_CLO(arg, "--separate-recs", TG_(clo).separate_recursions) {} + else if VG_INT_CLO (arg, "--separate-recs", TG_(clo).separate_recursions) { + } /* change handling of a jump between functions to ret+call */ - else if VG_XACT_CLO(arg, "--pop-on-jump", TG_(clo).pop_on_jump, True) {} - else if VG_STR_CLO( arg, "--pop-on-jump", tmp_str) { - fn_config* fnc = get_fnc(tmp_str); - fnc->pop_on_jump = CONFIG_TRUE; + else if VG_XACT_CLO (arg, "--pop-on-jump", TG_(clo).pop_on_jump, True) { + } else if VG_STR_CLO (arg, "--pop-on-jump", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->pop_on_jump = CONFIG_TRUE; } #if TG_ENABLE_DEBUG - else if VG_INT_CLO(arg, "--ct-verbose", TG_(clo).verbose) {} - else if VG_INT_CLO(arg, "--ct-vstart", TG_(clo).verbose_start) {} + else if VG_INT_CLO (arg, "--ct-verbose", TG_(clo).verbose) { + } else if VG_INT_CLO (arg, "--ct-vstart", TG_(clo).verbose_start) { + } - else if VG_STREQN(12, arg, "--ct-verbose") { - fn_config* fnc; - HChar* s; - UInt n = VG_(strtoll10)(arg+12, &s); - if ((n <= 0) || *s != '=') return False; - fnc = get_fnc(s+1); - fnc->verbosity = n; + else if VG_STREQN (12, arg, "--ct-verbose") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 12, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->verbosity = n; } #endif - else if VG_XACT_CLO(arg, "--separate-callers=auto", - TG_(clo).separate_callers, CONFIG_AUTO) {} - else if VG_INT_CLO( arg, "--separate-callers", - TG_(clo).separate_callers) {} - - else if VG_STREQN(10, arg, "--fn-group") { - fn_config* fnc; - HChar* s; - UInt n = VG_(strtoll10)(arg+10, &s); - if ((n <= 0) || *s != '=') return False; - fnc = get_fnc(s+1); - fnc->group = n; - } - - else if VG_STREQN(18, arg, "--separate-callers") { - fn_config* fnc; - HChar* s; - UInt n = VG_(strtoll10)(arg+18, &s); - if ((n <= 0) 
|| *s != '=') return False; - fnc = get_fnc(s+1); - fnc->separate_callers = n; - } - - else if VG_STREQN(15, arg, "--separate-recs") { - fn_config* fnc; - HChar* s; - UInt n = VG_(strtoll10)(arg+15, &s); - if ((n <= 0) || *s != '=') return False; - fnc = get_fnc(s+1); - fnc->separate_recursions = n; - } - - else if VG_STR_CLO(arg, "--tracegrind-out-file", TG_(clo).out_format) {} - - else if VG_XACT_CLO(arg, "--collect-systime=no", - TG_(clo).collect_systime, systime_no) {} - else if VG_XACT_CLO(arg, "--collect-systime=msec", - TG_(clo).collect_systime, systime_msec) {} - else if VG_XACT_CLO(arg, "--collect-systime=yes", /* backward compatibility. */ - TG_(clo).collect_systime, systime_msec) {} - else if VG_XACT_CLO(arg, "--collect-systime=usec", - TG_(clo).collect_systime, systime_usec) {} - else if VG_XACT_CLO(arg, "--collect-systime=nsec", - TG_(clo).collect_systime, systime_nsec) { -# if defined(VGO_darwin) + else if VG_XACT_CLO (arg, "--separate-callers=auto", + TG_(clo).separate_callers, CONFIG_AUTO) { + } else if VG_INT_CLO (arg, "--separate-callers", TG_(clo).separate_callers) { + } + + else if VG_STREQN (10, arg, "--fn-group") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 10, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->group = n; + } + + else if VG_STREQN (18, arg, "--separate-callers") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 18, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->separate_callers = n; + } + + else if VG_STREQN (15, arg, "--separate-recs") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 15, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->separate_recursions = n; + } + + else if VG_STR_CLO (arg, "--tracegrind-out-file", TG_(clo).out_format) { + } + + else if VG_XACT_CLO (arg, "--collect-systime=no", TG_(clo).collect_systime, + systime_no) { + } else if VG_XACT_CLO (arg, "--collect-systime=msec", + TG_(clo).collect_systime, systime_msec) { + } else if VG_XACT_CLO (arg, + "--collect-systime=yes", /* backward compatibility. 
*/ + TG_(clo).collect_systime, systime_msec) { + } else if VG_XACT_CLO (arg, "--collect-systime=usec", + TG_(clo).collect_systime, systime_usec) { + } else if VG_XACT_CLO (arg, "--collect-systime=nsec", + TG_(clo).collect_systime, systime_nsec) { +#if defined(VGO_darwin) VG_(fmsg_bad_option) - (arg, - "--collect-systime=nsec not supported on darwin\n"); -# endif + (arg, "--collect-systime=nsec not supported on darwin\n"); +#endif } - else if VG_BOOL_CLO(arg, "--collect-bus", TG_(clo).collect_bus) {} + else if VG_BOOL_CLO (arg, "--collect-bus", TG_(clo).collect_bus) { + } /* for option compatibility with cachegrind */ - else if VG_BOOL_CLO(arg, "--cache-sim", TG_(clo).simulate_cache) {} + else if VG_BOOL_CLO (arg, "--cache-sim", TG_(clo).simulate_cache) { + } /* compatibility alias, deprecated option */ - else if VG_BOOL_CLO(arg, "--simulate-cache", TG_(clo).simulate_cache) {} + else if VG_BOOL_CLO (arg, "--simulate-cache", TG_(clo).simulate_cache) { + } /* for option compatibility with cachegrind */ - else if VG_BOOL_CLO(arg, "--branch-sim", TG_(clo).simulate_branch) {} - else { - Bool isCachesimOption = (*TG_(cachesim).parse_opt)(arg); + else if VG_BOOL_CLO (arg, "--branch-sim", TG_(clo).simulate_branch) { + } else { + Bool isCachesimOption = (*TG_(cachesim).parse_opt)(arg); - /* cache simulator is used if a simulator option is given */ - if (isCachesimOption) - TG_(clo).simulate_cache = True; + /* cache simulator is used if a simulator option is given */ + if (isCachesimOption) + TG_(clo).simulate_cache = True; - return isCachesimOption; + return isCachesimOption; } return True; @@ -504,86 +521,93 @@ Bool TG_(process_cmd_line_option)(const HChar* arg) void TG_(print_usage)(void) { VG_(printf)( -"\n output options:\n" -" --tracegrind-out-file= Output file name [tracegrind.out.%%p.msgpack.lz4]\n" - -"\n data collection options:\n" -" --instr-atstart=no|yes Do instrumentation at tracegrind start [yes]\n" -" --collect-atstart=no|yes Collect at process/thread start [yes]\n" -" --toggle-collect= Toggle collection on enter/leave function\n" -" --collect-jumps=no|yes Collect jumps? [no]\n" -" --collect-bus=no|yes Collect global bus events? [no]\n" -" --collect-systime=no|yes|msec|usec|nsec Collect system call time info? [no]\n" -" no Do not collect system call time info.\n" -" msec|yes Collect syscount, syscall elapsed time (milli-seconds).\n" -" usec Collect syscount, syscall elapsed time (micro-seconds).\n" -" nsec Collect syscount, syscall elapsed and syscall cpu time (nano-seconds).\n" - -"\n cost entity separation options:\n" -" --separate-threads=no|yes Separate data per thread [no]\n" -" --separate-callers= Separate functions by call chain length [0]\n" -" --separate-callers= Separate callers for function \n" -" --separate-recs= Separate function recursions up to level [2]\n" -" --separate-recs= Separate recursions for function \n" -" --skip-plt=no|yes Ignore calls to/from PLT sections? [yes]\n" + "\n output options:\n" + " --tracegrind-out-file= Output file name " + "[tracegrind.out.%%p.msgpack.lz4]\n" + + "\n data collection options:\n" + " --instr-atstart=no|yes Do instrumentation at tracegrind start " + "[yes]\n" + " --collect-atstart=no|yes Collect at process/thread start [yes]\n" + " --toggle-collect= Toggle collection on enter/leave " + "function\n" + " --collect-jumps=no|yes Collect jumps? [no]\n" + " --collect-bus=no|yes Collect global bus events? [no]\n" + " --collect-systime=no|yes|msec|usec|nsec Collect system call time " + "info? 
[no]\n" + " no Do not collect system call time info.\n" + " msec|yes Collect syscount, syscall elapsed time " + "(milli-seconds).\n" + " usec Collect syscount, syscall elapsed time " + "(micro-seconds).\n" + " nsec Collect syscount, syscall elapsed and syscall cpu " + "time (nano-seconds).\n" + + "\n cost entity separation options:\n" + " --separate-threads=no|yes Separate data per thread [no]\n" + " --separate-callers= Separate functions by call chain length " + "[0]\n" + " --separate-callers= Separate callers for function \n" + " --separate-recs= Separate function recursions up to level " + "[2]\n" + " --separate-recs= Separate recursions for function \n" + " --skip-plt=no|yes Ignore calls to/from PLT sections? [yes]\n" #if TG_EXPERIMENTAL -" --fn-group= Put function into separation group \n" + " --fn-group= Put function into separation group \n" #endif -"\n simulation options:\n" -" --branch-sim=no|yes Do branch prediction simulation [no]\n" -" --cache-sim=no|yes Do cache simulation [no]\n" - ); + "\n simulation options:\n" + " --branch-sim=no|yes Do branch prediction simulation [no]\n" + " --cache-sim=no|yes Do cache simulation [no]\n"); (*TG_(cachesim).print_opts)(); -// VG_(printf)("\n" -// " For full tracegrind documentation, see\n" -// " "VG_PREFIX"/share/doc/tracegrind/html/tracegrind.html\n\n"); + // VG_(printf)("\n" + // " For full tracegrind documentation, see\n" + // " "VG_PREFIX"/share/doc/tracegrind/html/tracegrind.html\n\n"); } void TG_(print_debug_usage)(void) { - VG_(printf)( + VG_(printf)( #if TG_ENABLE_DEBUG -" --ct-verbose= Verbosity of standard debug output [0]\n" -" --ct-vstart= Only be verbose after basic block [0]\n" -" --ct-verbose= Verbosity while in \n" + " --ct-verbose= Verbosity of standard debug output [0]\n" + " --ct-vstart= Only be verbose after basic block [0]\n" + " --ct-verbose= Verbosity while in \n" #else -" (none)\n" + " (none)\n" #endif - ); + ); } - void TG_(set_clo_defaults)(void) { - /* Default values for command line arguments */ + /* Default values for command line arguments */ - /* Output */ - TG_(clo).out_format = 0; + /* Output */ + TG_(clo).out_format = 0; - /* Collection */ - TG_(clo).separate_threads = False; - TG_(clo).collect_atstart = True; - TG_(clo).collect_jumps = False; - TG_(clo).collect_systime = systime_no; - TG_(clo).collect_bus = False; + /* Collection */ + TG_(clo).separate_threads = False; + TG_(clo).collect_atstart = True; + TG_(clo).collect_jumps = False; + TG_(clo).collect_systime = systime_no; + TG_(clo).collect_bus = False; - TG_(clo).skip_plt = True; - TG_(clo).separate_callers = 0; - TG_(clo).separate_recursions = 2; - /* Instrumentation */ - TG_(clo).instrument_atstart = True; - TG_(clo).simulate_cache = False; - TG_(clo).simulate_branch = False; + TG_(clo).skip_plt = True; + TG_(clo).separate_callers = 0; + TG_(clo).separate_recursions = 2; + /* Instrumentation */ + TG_(clo).instrument_atstart = True; + TG_(clo).simulate_cache = False; + TG_(clo).simulate_branch = False; - /* Call graph */ - TG_(clo).pop_on_jump = False; + /* Call graph */ + TG_(clo).pop_on_jump = False; #if TG_ENABLE_DEBUG - TG_(clo).verbose = 0; - TG_(clo).verbose_start = 0; + TG_(clo).verbose = 0; + TG_(clo).verbose_start = 0; #endif } diff --git a/tracegrind/context.c b/tracegrind/context.c index e80234891..44fc16331 100644 --- a/tracegrind/context.c +++ b/tracegrind/context.c @@ -26,43 +26,42 @@ #include "global.h" - /*------------------------------------------------------------*/ /*--- Context operations ---*/ 
/*------------------------------------------------------------*/ #define N_FNSTACK_INITIAL_ENTRIES 500 -#define N_CXT_INITIAL_ENTRIES 2537 +#define N_CXT_INITIAL_ENTRIES 2537 fn_stack TG_(current_fn_stack); void TG_(init_fn_stack)(fn_stack* s) { - TG_ASSERT(s != 0); + TG_ASSERT(s != 0); - s->size = N_FNSTACK_INITIAL_ENTRIES; - s->bottom = (fn_node**) TG_MALLOC("cl.context.ifs.1", - s->size * sizeof(fn_node*)); - s->top = s->bottom; - s->bottom[0] = 0; + s->size = N_FNSTACK_INITIAL_ENTRIES; + s->bottom = + (fn_node**)TG_MALLOC("cl.context.ifs.1", s->size * sizeof(fn_node*)); + s->top = s->bottom; + s->bottom[0] = 0; } void TG_(copy_current_fn_stack)(fn_stack* dst) { - TG_ASSERT(dst != 0); + TG_ASSERT(dst != 0); - dst->size = TG_(current_fn_stack).size; - dst->bottom = TG_(current_fn_stack).bottom; - dst->top = TG_(current_fn_stack).top; + dst->size = TG_(current_fn_stack).size; + dst->bottom = TG_(current_fn_stack).bottom; + dst->top = TG_(current_fn_stack).top; } void TG_(set_current_fn_stack)(fn_stack* s) { - TG_ASSERT(s != 0); + TG_ASSERT(s != 0); - TG_(current_fn_stack).size = s->size; - TG_(current_fn_stack).bottom = s->bottom; - TG_(current_fn_stack).top = s->top; + TG_(current_fn_stack).size = s->size; + TG_(current_fn_stack).bottom = s->bottom; + TG_(current_fn_stack).top = s->top; } static cxt_hash cxts; @@ -70,94 +69,94 @@ static cxt_hash cxts; void TG_(init_cxt_table)(void) { Int i; - + cxts.size = N_CXT_INITIAL_ENTRIES; cxts.entries = 0; - cxts.table = (Context**) TG_MALLOC("cl.context.ict.1", - cxts.size * sizeof(Context*)); + cxts.table = + (Context**)TG_MALLOC("cl.context.ict.1", cxts.size * sizeof(Context*)); for (i = 0; i < cxts.size; i++) - cxts.table[i] = 0; + cxts.table[i] = 0; } /* double size of cxt table */ static void resize_cxt_table(void) { - UInt i, new_size, conflicts1 = 0, conflicts2 = 0; - Context **new_table, *curr, *next; - UInt new_idx; + UInt i, new_size, conflicts1 = 0, conflicts2 = 0; + Context **new_table, *curr, *next; + UInt new_idx; - new_size = 2* cxts.size +3; - new_table = (Context**) TG_MALLOC("cl.context.rct.1", - new_size * sizeof(Context*)); + new_size = 2 * cxts.size + 3; + new_table = + (Context**)TG_MALLOC("cl.context.rct.1", new_size * sizeof(Context*)); - for (i = 0; i < new_size; i++) + for (i = 0; i < new_size; i++) new_table[i] = NULL; - for (i = 0; i < cxts.size; i++) { - if (cxts.table[i] == NULL) continue; + for (i = 0; i < cxts.size; i++) { + if (cxts.table[i] == NULL) + continue; - curr = cxts.table[i]; - while (NULL != curr) { - next = curr->next; + curr = cxts.table[i]; + while (NULL != curr) { + next = curr->next; - new_idx = (UInt) (curr->hash % new_size); + new_idx = (UInt)(curr->hash % new_size); - curr->next = new_table[new_idx]; - new_table[new_idx] = curr; - if (curr->next) { - conflicts1++; - if (curr->next->next) - conflicts2++; - } + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } - curr = next; - } - } + curr = next; + } + } - VG_(free)(cxts.table); + VG_(free)(cxts.table); + TG_DEBUG(0, "Resize Context Hash: %u => %u (entries %u, conflicts %u/%u)\n", + cxts.size, new_size, cxts.entries, conflicts1, conflicts2); - TG_DEBUG(0, "Resize Context Hash: %u => %u (entries %u, conflicts %u/%u)\n", - cxts.size, new_size, - cxts.entries, conflicts1, conflicts2); - - cxts.size = new_size; - cxts.table = new_table; - TG_(stat).cxt_hash_resizes++; + cxts.size = new_size; + cxts.table = new_table; + TG_(stat).cxt_hash_resizes++; } 
-__inline__ -static UWord cxt_hash_val(fn_node** fn, UInt size) +__inline__ static UWord cxt_hash_val(fn_node** fn, UInt size) { - UWord hash = 0; - UInt count = size; - while(*fn != 0) { - hash = (hash<<7) + (hash>>25) + (UWord)(*fn); - fn--; - count--; - if (count==0) break; - } - return hash; + UWord hash = 0; + UInt count = size; + while (*fn != 0) { + hash = (hash << 7) + (hash >> 25) + (UWord)(*fn); + fn--; + count--; + if (count == 0) + break; + } + return hash; } -__inline__ -static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) +__inline__ static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) { - int count; - fn_node** cxt_fn; - - if (hash != cxt->hash) return False; - - count = cxt->size; - cxt_fn = &(cxt->fn[0]); - while((*fn != 0) && (count>0)) { - if (*cxt_fn != *fn) return False; - fn--; - cxt_fn++; - count--; - } - return True; + int count; + fn_node** cxt_fn; + + if (hash != cxt->hash) + return False; + + count = cxt->size; + cxt_fn = &(cxt->fn[0]); + while ((*fn != 0) && (count > 0)) { + if (*cxt_fn != *fn) + return False; + fn--; + cxt_fn++; + count--; + } + return True; } /** @@ -165,166 +164,172 @@ static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) */ static Context* new_cxt(fn_node** fn) { - Context* cxt; - UInt idx, offset; - UWord hash; - int size, recs; - fn_node* top_fn; - - TG_ASSERT(fn); - top_fn = *fn; - if (top_fn == 0) return 0; - - size = top_fn->separate_callers +1; - recs = top_fn->separate_recursions; - if (recs<1) recs=1; - - /* check fill degree of context hash table and resize if needed (>80%) */ - cxts.entries++; - if (10 * cxts.entries / cxts.size > 8) - resize_cxt_table(); - - cxt = (Context*) TG_MALLOC("cl.context.nc.1", - sizeof(Context)+sizeof(fn_node*)*size); - - // hash value calculation similar to cxt_hash_val(), but additionally - // copying function pointers in one run - hash = 0; - offset = 0; - while(*fn != 0) { - hash = (hash<<7) + (hash>>25) + (UWord)(*fn); - cxt->fn[offset] = *fn; - offset++; - fn--; - if (offset >= size) break; - } - if (offset < size) size = offset; - - cxt->size = size; - cxt->base_number = TG_(stat).context_counter; - cxt->hash = hash; - - TG_(stat).context_counter += recs; - TG_(stat).distinct_contexts++; - - /* insert into Context hash table */ - idx = (UInt) (hash % cxts.size); - cxt->next = cxts.table[idx]; - cxts.table[idx] = cxt; + Context* cxt; + UInt idx, offset; + UWord hash; + int size, recs; + fn_node* top_fn; + + TG_ASSERT(fn); + top_fn = *fn; + if (top_fn == 0) + return 0; + + size = top_fn->separate_callers + 1; + recs = top_fn->separate_recursions; + if (recs < 1) + recs = 1; + + /* check fill degree of context hash table and resize if needed (>80%) */ + cxts.entries++; + if (10 * cxts.entries / cxts.size > 8) + resize_cxt_table(); + + cxt = (Context*)TG_MALLOC("cl.context.nc.1", + sizeof(Context) + sizeof(fn_node*) * size); + + // hash value calculation similar to cxt_hash_val(), but additionally + // copying function pointers in one run + hash = 0; + offset = 0; + while (*fn != 0) { + hash = (hash << 7) + (hash >> 25) + (UWord)(*fn); + cxt->fn[offset] = *fn; + offset++; + fn--; + if (offset >= size) + break; + } + if (offset < size) + size = offset; + + cxt->size = size; + cxt->base_number = TG_(stat).context_counter; + cxt->hash = hash; + + TG_(stat).context_counter += recs; + TG_(stat).distinct_contexts++; + + /* insert into Context hash table */ + idx = (UInt)(hash % cxts.size); + cxt->next = cxts.table[idx]; + cxts.table[idx] = cxt; #if TG_ENABLE_DEBUG - TG_DEBUGIF(3) { + 
TG_DEBUGIF(3) + { VG_(printf)(" new_cxt ox%p: ", cxt); TG_(print_cxt)(12, cxt, 0); - } + } #endif - return cxt; + return cxt; } /* get the Context structure for current context */ Context* TG_(get_cxt)(fn_node** fn) { - Context* cxt; - UInt size, idx; - UWord hash; + Context* cxt; + UInt size, idx; + UWord hash; - TG_ASSERT(fn != 0); - if (*fn == 0) return 0; - size = (*fn)->separate_callers+1; - if (size<=0) { size = -size+1; } + TG_ASSERT(fn != 0); + if (*fn == 0) + return 0; + size = (*fn)->separate_callers + 1; + if (size <= 0) { + size = -size + 1; + } - TG_DEBUG(5, "+ get_cxt(fn '%s'): size %u\n", - (*fn)->name, size); + TG_DEBUG(5, "+ get_cxt(fn '%s'): size %u\n", (*fn)->name, size); - hash = cxt_hash_val(fn, size); + hash = cxt_hash_val(fn, size); - if ( ((cxt = (*fn)->last_cxt) != 0) && is_cxt(hash, fn, cxt)) { - TG_DEBUG(5, "- get_cxt: %p\n", cxt); - return cxt; - } + if (((cxt = (*fn)->last_cxt) != 0) && is_cxt(hash, fn, cxt)) { + TG_DEBUG(5, "- get_cxt: %p\n", cxt); + return cxt; + } - TG_(stat).cxt_lru_misses++; + TG_(stat).cxt_lru_misses++; - idx = (UInt) (hash % cxts.size); - cxt = cxts.table[idx]; + idx = (UInt)(hash % cxts.size); + cxt = cxts.table[idx]; - while(cxt) { - if (is_cxt(hash,fn,cxt)) break; - cxt = cxt->next; - } + while (cxt) { + if (is_cxt(hash, fn, cxt)) + break; + cxt = cxt->next; + } - if (!cxt) - cxt = new_cxt(fn); + if (!cxt) + cxt = new_cxt(fn); - (*fn)->last_cxt = cxt; + (*fn)->last_cxt = cxt; - TG_DEBUG(5, "- get_cxt: %p\n", cxt); + TG_DEBUG(5, "- get_cxt: %p\n", cxt); - return cxt; + return cxt; } - /** * Change execution context by calling a new function from current context * Pushing 0x0 specifies a marker for a signal handler entry */ void TG_(push_cxt)(fn_node* fn) { - call_stack* cs = &TG_(current_call_stack); - Int fn_entries; - - TG_DEBUG(5, "+ push_cxt(fn '%s'): old ctx %d\n", - fn ? fn->name : "0x0", - TG_(current_state).cxt ? - (Int)TG_(current_state).cxt->base_number : -1); - - /* save old context on stack (even if not changed at all!) */ - TG_ASSERT(cs->sp < cs->size); - TG_ASSERT(cs->entry[cs->sp].cxt == 0); - cs->entry[cs->sp].cxt = TG_(current_state).cxt; - cs->entry[cs->sp].fn_sp = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; - - if (fn && (*(TG_(current_fn_stack).top) == fn)) return; - if (fn && (fn->group>0) && - ((*(TG_(current_fn_stack).top))->group == fn->group)) return; - - /* resizing needed ? */ - fn_entries = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; - if (fn_entries == TG_(current_fn_stack).size-1) { - UInt new_size = TG_(current_fn_stack).size *2; - fn_node** new_array = (fn_node**) TG_MALLOC("cl.context.pc.1", - new_size * sizeof(fn_node*)); - int i; - for(i=0;i %u (pushing '%s')\n", - TG_(current_fn_stack).size, new_size, - fn ? fn->name : "0x0"); - - TG_(current_fn_stack).size = new_size; - } - - if (fn && (*(TG_(current_fn_stack).top) == 0)) { - UInt *pactive; - - /* this is first function: increment its active count */ - pactive = TG_(get_fn_entry)(fn->number); - (*pactive)++; - } - - TG_(current_fn_stack).top++; - *(TG_(current_fn_stack).top) = fn; - TG_(current_state).cxt = TG_(get_cxt)(TG_(current_fn_stack).top); - - TG_DEBUG(5, "- push_cxt(fn '%s'): new cxt %d, fn_sp %ld\n", - fn ? fn->name : "0x0", - TG_(current_state).cxt ? - (Int)TG_(current_state).cxt->base_number : -1, - TG_(current_fn_stack).top - TG_(current_fn_stack).bottom + 0L); + call_stack* cs = &TG_(current_call_stack); + Int fn_entries; + + TG_DEBUG(5, "+ push_cxt(fn '%s'): old ctx %d\n", fn ? 
fn->name : "0x0", + TG_(current_state).cxt ? (Int)TG_(current_state).cxt->base_number + : -1); + + /* save old context on stack (even if not changed at all!) */ + TG_ASSERT(cs->sp < cs->size); + TG_ASSERT(cs->entry[cs->sp].cxt == 0); + cs->entry[cs->sp].cxt = TG_(current_state).cxt; + cs->entry[cs->sp].fn_sp = + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; + + if (fn && (*(TG_(current_fn_stack).top) == fn)) + return; + if (fn && (fn->group > 0) && + ((*(TG_(current_fn_stack).top))->group == fn->group)) + return; + + /* resizing needed ? */ + fn_entries = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; + if (fn_entries == TG_(current_fn_stack).size - 1) { + UInt new_size = TG_(current_fn_stack).size * 2; + fn_node** new_array = + (fn_node**)TG_MALLOC("cl.context.pc.1", new_size * sizeof(fn_node*)); + int i; + for (i = 0; i < TG_(current_fn_stack).size; i++) + new_array[i] = TG_(current_fn_stack).bottom[i]; + VG_(free)(TG_(current_fn_stack).bottom); + TG_(current_fn_stack).top = new_array + fn_entries; + TG_(current_fn_stack).bottom = new_array; + + TG_DEBUG(0, "Resize Context Stack: %u => %u (pushing '%s')\n", + TG_(current_fn_stack).size, new_size, fn ? fn->name : "0x0"); + + TG_(current_fn_stack).size = new_size; + } + + if (fn && (*(TG_(current_fn_stack).top) == 0)) { + UInt* pactive; + + /* this is first function: increment its active count */ + pactive = TG_(get_fn_entry)(fn->number); + (*pactive)++; + } + + TG_(current_fn_stack).top++; + *(TG_(current_fn_stack).top) = fn; + TG_(current_state).cxt = TG_(get_cxt)(TG_(current_fn_stack).top); + + TG_DEBUG( + 5, "- push_cxt(fn '%s'): new cxt %d, fn_sp %ld\n", fn ? fn->name : "0x0", + TG_(current_state).cxt ? (Int)TG_(current_state).cxt->base_number : -1, + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom + 0L); } - diff --git a/tracegrind/costs.c b/tracegrind/costs.c index 765081b0a..bc7cd41eb 100644 --- a/tracegrind/costs.c +++ b/tracegrind/costs.c @@ -30,39 +30,39 @@ #define COSTCHUNK_SIZE 100000 -UInt TG_(costarray_entries) = 0; -UInt TG_(costarray_chunks) = 0; -static CostChunk* cost_chunk_base = 0; -static CostChunk* cost_chunk_current = 0; +UInt TG_(costarray_entries) = 0; +UInt TG_(costarray_chunks) = 0; +static CostChunk* cost_chunk_base = 0; +static CostChunk* cost_chunk_current = 0; ULong* TG_(get_costarray)(Int size) { - ULong* ptr; + ULong* ptr; - if (!cost_chunk_current || - (cost_chunk_current->size - cost_chunk_current->used < size)) { - CostChunk* cc = (CostChunk*) TG_MALLOC("cl.costs.gc.1", - sizeof(CostChunk) + - COSTCHUNK_SIZE * sizeof(ULong)); - TG_ASSERT(size < COSTCHUNK_SIZE); + if (!cost_chunk_current || + (cost_chunk_current->size - cost_chunk_current->used < size)) { + CostChunk* cc = (CostChunk*)TG_MALLOC( + "cl.costs.gc.1", sizeof(CostChunk) + COSTCHUNK_SIZE * sizeof(ULong)); + TG_ASSERT(size < COSTCHUNK_SIZE); - cc->size = COSTCHUNK_SIZE; - cc->used = 0; - cc->next = 0; + cc->size = COSTCHUNK_SIZE; + cc->used = 0; + cc->next = 0; - if (cost_chunk_current) - cost_chunk_current->next = cc; - cost_chunk_current = cc; + if (cost_chunk_current) + cost_chunk_current->next = cc; + cost_chunk_current = cc; - if (!cost_chunk_base) cost_chunk_base = cc; + if (!cost_chunk_base) + cost_chunk_base = cc; - TG_(costarray_chunks)++; - } - - ptr = &(cost_chunk_current->data[cost_chunk_current->used]); - cost_chunk_current->used += size; + TG_(costarray_chunks)++; + } - TG_(costarray_entries) += size; + ptr = &(cost_chunk_current->data[cost_chunk_current->used]); + cost_chunk_current->used += size; 
- return ptr; + TG_(costarray_entries) += size; + + return ptr; } diff --git a/tracegrind/costs.h b/tracegrind/costs.h index eedf60c83..2e51c344d 100644 --- a/tracegrind/costs.h +++ b/tracegrind/costs.h @@ -25,13 +25,12 @@ The GNU General Public License is contained in the file COPYING. */ - #ifndef TG_COSTS #define TG_COSTS #include "pub_tool_basics.h" -#define TG_(str) VGAPPEND(vgTracegrind_,str) +#define TG_(str) VGAPPEND(vgTracegrind_, str) extern UInt TG_(costarray_entries); extern UInt TG_(costarray_chunks); @@ -42,10 +41,10 @@ extern UInt TG_(costarray_chunks); */ typedef struct _CostChunk CostChunk; struct _CostChunk { - Int size; - Int used; - CostChunk *next, *prev; - ULong data[0]; + Int size; + Int used; + CostChunk *next, *prev; + ULong data[0]; }; /* Allocate a number of 64bit cost values. diff --git a/tracegrind/debug.c b/tracegrind/debug.c index 940a9e803..fa8f876e2 100644 --- a/tracegrind/debug.c +++ b/tracegrind/debug.c @@ -23,8 +23,8 @@ The GNU General Public License is contained in the file COPYING. */ -#include "global.h" #include "events.h" +#include "global.h" /* If debugging mode of, dummy functions are provided (see below) */ @@ -36,246 +36,240 @@ static void print_indent(int s) { - /* max of 40 spaces */ - const HChar sp[] = " "; - if (s>40) s=40; - VG_(printf)("%s", sp+40-s); + /* max of 40 spaces */ + const HChar sp[] = " "; + if (s > 40) + s = 40; + VG_(printf)("%s", sp + 40 - s); } void TG_(print_bb)(int s, BB* bb) { - if (s<0) { - s = -s; - print_indent(s); - } + if (s < 0) { + s = -s; + print_indent(s); + } - VG_(printf)("BB %#lx (Obj '%s')", bb_addr(bb), bb->obj->name); + VG_(printf)("BB %#lx (Obj '%s')", bb_addr(bb), bb->obj->name); } -static -void print_mangled_cxt(Context* cxt, int rec_index) +static void print_mangled_cxt(Context* cxt, int rec_index) { - int i; + int i; - if (!cxt) + if (!cxt) VG_(printf)("(none)"); - else { + else { VG_(printf)("%s", cxt->fn[0]->name); - if (rec_index >0) - VG_(printf)("'%d", rec_index +1); - for(i=1;isize;i++) - VG_(printf)("'%s", cxt->fn[i]->name); - } + if (rec_index > 0) + VG_(printf)("'%d", rec_index + 1); + for (i = 1; i < cxt->size; i++) + VG_(printf)("'%s", cxt->fn[i]->name); + } } - - void TG_(print_cxt)(Int s, Context* cxt, int rec_index) { - if (s<0) { - s = -s; - print_indent(s); - } - - if (cxt) { - UInt *pactive = TG_(get_fn_entry)(cxt->fn[0]->number); - TG_ASSERT(rec_index < cxt->fn[0]->separate_recursions); - - VG_(printf)("Cxt %u" ,cxt->base_number + rec_index); - if (*pactive>0) - VG_(printf)(" [active=%u]", *pactive); - VG_(printf)(": "); - print_mangled_cxt(cxt, rec_index); - VG_(printf)("\n"); - } - else - VG_(printf)("(no context)\n"); + if (s < 0) { + s = -s; + print_indent(s); + } + + if (cxt) { + UInt* pactive = TG_(get_fn_entry)(cxt->fn[0]->number); + TG_ASSERT(rec_index < cxt->fn[0]->separate_recursions); + + VG_(printf)("Cxt %u", cxt->base_number + rec_index); + if (*pactive > 0) + VG_(printf)(" [active=%u]", *pactive); + VG_(printf)(": "); + print_mangled_cxt(cxt, rec_index); + VG_(printf)("\n"); + } else + VG_(printf)("(no context)\n"); } void TG_(print_execstate)(int s, exec_state* es) { - if (s<0) { - s = -s; - print_indent(s); - } - - if (!es) { - VG_(printf)("ExecState 0x0\n"); - return; - } - - VG_(printf)("ExecState [Sig %d, collect %s, nonskipped %p]: jmps_passed %d\n", - es->sig, es->collect?"yes":"no", - es->nonskipped, es->jmps_passed); -} + if (s < 0) { + s = -s; + print_indent(s); + } + if (!es) { + VG_(printf)("ExecState 0x0\n"); + return; + } + + VG_(printf)( + "ExecState 
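
TG_(get_costarray) above is a simple bump allocator: cost arrays are carved out of COSTCHUNK_SIZE-slot chunks that are chained together and never freed individually, which matches how the tool uses them (allocated once, alive for the whole run). For readers outside the Valgrind tree, an equivalent standalone sketch of the same idea, using plain malloc in place of TG_MALLOC; the names here are hypothetical and not part of the patch:

#include <stdlib.h>

#define COSTCHUNK_SIZE 100000

typedef struct Chunk {
   int size, used;
   struct Chunk* next;
   unsigned long long data[];          /* flexible array member */
} Chunk;

static Chunk* chunk_current = 0;

/* Hand out 'n' contiguous 64-bit counters; open a fresh chunk when the
   current one cannot satisfy the request.  Chunks are never freed. */
unsigned long long* get_costarray(int n)
{
   if (!chunk_current || chunk_current->size - chunk_current->used < n) {
      Chunk* c = malloc(sizeof(Chunk)
                        + COSTCHUNK_SIZE * sizeof(unsigned long long));
      if (!c) return 0;
      c->size = COSTCHUNK_SIZE;
      c->used = 0;
      c->next = 0;
      if (chunk_current) chunk_current->next = c;
      chunk_current = c;
   }
   unsigned long long* p = &chunk_current->data[chunk_current->used];
   chunk_current->used += n;
   return p;
}
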
[Sig %d, collect %s, nonskipped %p]: jmps_passed %d\n", + es->sig, es->collect ? "yes" : "no", es->nonskipped, es->jmps_passed); +} void TG_(print_bbcc)(int s, BBCC* bbcc) { - BB* bb; - - if (s<0) { - s = -s; - print_indent(s); - } - - if (!bbcc) { - VG_(printf)("BBCC 0x0\n"); - return; - } - - bb = bbcc->bb; - TG_ASSERT(bb!=0); - - VG_(printf)("%s +%#lx=%#lx, ", - bb->obj->name + bb->obj->last_slash_pos, - (UWord)bb->offset, bb_addr(bb)); - TG_(print_cxt)(s+8, bbcc->cxt, bbcc->rec_index); + BB* bb; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb != 0); + + VG_(printf)("%s +%#lx=%#lx, ", bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + TG_(print_cxt)(s + 8, bbcc->cxt, bbcc->rec_index); } void TG_(print_eventset)(int s, EventSet* es) { - int i, j; - UInt mask; - EventGroup* eg; - - if (s<0) { - s = -s; - print_indent(s); - } - - if (!es) { - VG_(printf)("(EventSet not set)\n"); - return; - } - - VG_(printf)("EventSet %u (%d groups, size %d):", - es->mask, es->count, es->size); - - if (es->count == 0) { - VG_(printf)("-\n"); - return; - } - - for(i=0, mask=1; imask & mask)==0) continue; - eg = TG_(get_event_group)(i); - if (!eg) continue; - VG_(printf)(" (%d: %s", i, eg->name[0]); - for(j=1; jsize; j++) - VG_(printf)(" %s", eg->name[j]); - VG_(printf)(")"); - } - VG_(printf)("\n"); -} + int i, j; + UInt mask; + EventGroup* eg; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("(EventSet not set)\n"); + return; + } + VG_(printf)("EventSet %u (%d groups, size %d):", es->mask, es->count, + es->size); + + if (es->count == 0) { + VG_(printf)("-\n"); + return; + } + + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((es->mask & mask) == 0) + continue; + eg = TG_(get_event_group)(i); + if (!eg) + continue; + VG_(printf)(" (%d: %s", i, eg->name[0]); + for (j = 1; j < eg->size; j++) + VG_(printf)(" %s", eg->name[j]); + VG_(printf)(")"); + } + VG_(printf)("\n"); +} void TG_(print_cost)(int s, EventSet* es, ULong* c) { - Int i, j, pos, off; - UInt mask; - EventGroup* eg; + Int i, j, pos, off; + UInt mask; + EventGroup* eg; - if (s<0) { - s = -s; - print_indent(s); - } + if (s < 0) { + s = -s; + print_indent(s); + } - if (!es) { + if (!es) { VG_(printf)("Cost (Nothing, EventSet not set)\n"); return; - } - if (!c) { + } + if (!c) { VG_(printf)("Cost (Null, EventSet %u)\n", es->mask); return; - } + } - if (es->size == 0) { + if (es->size == 0) { VG_(printf)("Cost (Nothing, EventSet with len 0)\n"); return; - } - - pos = s; - pos += VG_(printf)("Cost [%p]: ", c); - off = 0; - for(i=0, mask=1; imask & mask)==0) continue; - eg = TG_(get_event_group)(i); - if (!eg) continue; - for(j=0; jsize; j++) { - - if (off>0) { - if (pos > 70) { - VG_(printf)(",\n"); - print_indent(s+5); - pos = s+5; - } - else - pos += VG_(printf)(", "); - } - - pos += VG_(printf)("%s %llu", eg->name[j], c[off++]); - } - } - VG_(printf)("\n"); + } + + pos = s; + pos += VG_(printf)("Cost [%p]: ", c); + off = 0; + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((es->mask & mask) == 0) + continue; + eg = TG_(get_event_group)(i); + if (!eg) + continue; + for (j = 0; j < eg->size; j++) { + + if (off > 0) { + if (pos > 70) { + VG_(printf)(",\n"); + print_indent(s + 5); + pos = s + 5; + } else + pos += VG_(printf)(", "); + } + + pos += VG_(printf)("%s %llu", eg->name[j], c[off++]); + } + } + VG_(printf)("\n"); } - void 
TG_(print_short_jcc)(jCC* jcc) { - if (jcc) - VG_(printf)("%#lx => %#lx [calls %llu/Ir %llu, Dr %llu, Dw %llu]", - bb_jmpaddr(jcc->from->bb), - bb_addr(jcc->to->bb), - jcc->call_counter, - jcc->cost ? jcc->cost[fullOffset(EG_IR)]:0, - jcc->cost ? jcc->cost[fullOffset(EG_DR)]:0, - jcc->cost ? jcc->cost[fullOffset(EG_DW)]:0); - else - VG_(printf)("[Skipped JCC]"); + if (jcc) + VG_(printf)("%#lx => %#lx [calls %llu/Ir %llu, Dr %llu, Dw %llu]", + bb_jmpaddr(jcc->from->bb), bb_addr(jcc->to->bb), + jcc->call_counter, + jcc->cost ? jcc->cost[fullOffset(EG_IR)] : 0, + jcc->cost ? jcc->cost[fullOffset(EG_DR)] : 0, + jcc->cost ? jcc->cost[fullOffset(EG_DW)] : 0); + else + VG_(printf)("[Skipped JCC]"); } void TG_(print_jcc)(int s, jCC* jcc) { - if (s<0) { - s = -s; - print_indent(s); - } - - if (!jcc) { - VG_(printf)("JCC to skipped function\n"); - return; - } - VG_(printf)("JCC %p from ", jcc); - TG_(print_bbcc)(s+9, jcc->from); - print_indent(s+4); - VG_(printf)("to "); - TG_(print_bbcc)(s+9, jcc->to); - print_indent(s+4); - VG_(printf)("Calls %llu\n", jcc->call_counter); - print_indent(s+4); - TG_(print_cost)(s+9, TG_(sets).full, jcc->cost); + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!jcc) { + VG_(printf)("JCC to skipped function\n"); + return; + } + VG_(printf)("JCC %p from ", jcc); + TG_(print_bbcc)(s + 9, jcc->from); + print_indent(s + 4); + VG_(printf)("to "); + TG_(print_bbcc)(s + 9, jcc->to); + print_indent(s + 4); + VG_(printf)("Calls %llu\n", jcc->call_counter); + print_indent(s + 4); + TG_(print_cost)(s + 9, TG_(sets).full, jcc->cost); } /* dump out the current call stack */ void TG_(print_stackentry)(int s, int sp) { - call_entry* ce; - - if (s<0) { - s = -s; - print_indent(s); - } - - ce = TG_(get_call_entry)(sp); - VG_(printf)("[%-2d] SP %#lx, RA %#lx", sp, ce->sp, ce->ret_addr); - if (ce->nonskipped) - VG_(printf)(" NonSkipped BB %#lx / %s", - bb_addr(ce->nonskipped->bb), - ce->nonskipped->cxt->fn[0]->name); - VG_(printf)("\n"); - print_indent(s+5); - TG_(print_jcc)(5,ce->jcc); + call_entry* ce; + + if (s < 0) { + s = -s; + print_indent(s); + } + + ce = TG_(get_call_entry)(sp); + VG_(printf)("[%-2d] SP %#lx, RA %#lx", sp, ce->sp, ce->ret_addr); + if (ce->nonskipped) + VG_(printf)(" NonSkipped BB %#lx / %s", bb_addr(ce->nonskipped->bb), + ce->nonskipped->cxt->fn[0]->name); + VG_(printf)("\n"); + print_indent(s + 5); + TG_(print_jcc)(5, ce->jcc); } /* debug output */ @@ -292,154 +286,151 @@ static void print_call_stack() void TG_(print_bbcc_fn)(BBCC* bbcc) { - obj_node* obj; - - if (!bbcc) { - VG_(printf)("%08x", 0u); - return; - } - - VG_(printf)("%08lx/%c %u:", bb_addr(bbcc->bb), - (bbcc->bb->sect_kind == Vg_SectText) ? 'T' : - (bbcc->bb->sect_kind == Vg_SectData) ? 'D' : - (bbcc->bb->sect_kind == Vg_SectBSS) ? 'B' : - (bbcc->bb->sect_kind == Vg_SectGOT) ? 'G' : - (bbcc->bb->sect_kind == Vg_SectPLT) ? 'P' : 'U', - bbcc->cxt->base_number+bbcc->rec_index); - print_mangled_cxt(bbcc->cxt, bbcc->rec_index); - - obj = bbcc->cxt->fn[0]->file->obj; - if (obj->name[0]) - VG_(printf)(" %s", obj->name+obj->last_slash_pos); - - if (VG_(strcmp)(bbcc->cxt->fn[0]->file->name, "???") !=0) { - VG_(printf)(" %s", bbcc->cxt->fn[0]->file->name); - if ((bbcc->cxt->fn[0] == bbcc->bb->fn) && (bbcc->bb->line>0)) - VG_(printf)(":%u", bbcc->bb->line); - } -} + obj_node* obj; + + if (!bbcc) { + VG_(printf)("%08x", 0u); + return; + } + + VG_(printf)("%08lx/%c %u:", bb_addr(bbcc->bb), + (bbcc->bb->sect_kind == Vg_SectText) ? 'T' + : (bbcc->bb->sect_kind == Vg_SectData) ? 
'D' + : (bbcc->bb->sect_kind == Vg_SectBSS) ? 'B' + : (bbcc->bb->sect_kind == Vg_SectGOT) ? 'G' + : (bbcc->bb->sect_kind == Vg_SectPLT) ? 'P' + : 'U', + bbcc->cxt->base_number + bbcc->rec_index); + print_mangled_cxt(bbcc->cxt, bbcc->rec_index); + + obj = bbcc->cxt->fn[0]->file->obj; + if (obj->name[0]) + VG_(printf)(" %s", obj->name + obj->last_slash_pos); + + if (VG_(strcmp)(bbcc->cxt->fn[0]->file->name, "???") != 0) { + VG_(printf)(" %s", bbcc->cxt->fn[0]->file->name); + if ((bbcc->cxt->fn[0] == bbcc->bb->fn) && (bbcc->bb->line > 0)) + VG_(printf)(":%u", bbcc->bb->line); + } +} void TG_(print_bbcc_cost)(int s, BBCC* bbcc) { - BB* bb; - Int i, cjmpNo; - ULong ecounter; - - if (s<0) { - s = -s; - print_indent(s); - } - - if (!bbcc) { - VG_(printf)("BBCC 0x0\n"); - return; - } - - bb = bbcc->bb; - TG_ASSERT(bb!=0); - - TG_(print_bbcc)(s, bbcc); - - ecounter = bbcc->ecounter_sum; - - print_indent(s+2); - VG_(printf)("ECounter: sum %llu ", ecounter); - for(i=0; icjmp_count; i++) { - VG_(printf)("[%u]=%llu ", - bb->jmp[i].instr, bbcc->jmp[i].ecounter); - } - VG_(printf)("\n"); - - cjmpNo = 0; - for(i=0; iinstr_count; i++) { + BB* bb; + Int i, cjmpNo; + ULong ecounter; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb != 0); + + TG_(print_bbcc)(s, bbcc); + + ecounter = bbcc->ecounter_sum; + + print_indent(s + 2); + VG_(printf)("ECounter: sum %llu ", ecounter); + for (i = 0; i < bb->cjmp_count; i++) { + VG_(printf)("[%u]=%llu ", bb->jmp[i].instr, bbcc->jmp[i].ecounter); + } + VG_(printf)("\n"); + + cjmpNo = 0; + for (i = 0; i < bb->instr_count; i++) { InstrInfo* ii = &(bb->instr[i]); - print_indent(s+2); - VG_(printf)("[%2d] IOff %2u ecnt %3llu ", - i, ii->instr_offset, ecounter); - TG_(print_cost)(s+5, ii->eventset, bbcc->cost + ii->cost_offset); + print_indent(s + 2); + VG_(printf)("[%2d] IOff %2u ecnt %3llu ", i, ii->instr_offset, ecounter); + TG_(print_cost)(s + 5, ii->eventset, bbcc->cost + ii->cost_offset); /* update execution counter */ if (cjmpNo < bb->cjmp_count) - if (bb->jmp[cjmpNo].instr == i) { - ecounter -= bbcc->jmp[cjmpNo].ecounter; - cjmpNo++; - } - } + if (bb->jmp[cjmpNo].instr == i) { + ecounter -= bbcc->jmp[cjmpNo].ecounter; + cjmpNo++; + } + } } - /* dump out an address with source info if available */ void TG_(print_addr)(Addr addr) { - const HChar *fn_buf, *fl_buf, *dir_buf; - const HChar* obj_name; - DebugInfo* di; - UInt ln, i=0, opos=0; - - if (addr == 0) { - VG_(printf)("%08lx", addr); - return; - } - - TG_(get_debug_info)(addr, &dir_buf, &fl_buf, &fn_buf, &ln, &di); - - if (VG_(strcmp)(fn_buf,"???")==0) - VG_(printf)("%#lx", addr); - else - VG_(printf)("%#lx %s", addr, fn_buf); - - if (di) { + const HChar *fn_buf, *fl_buf, *dir_buf; + const HChar* obj_name; + DebugInfo* di; + UInt ln, i = 0, opos = 0; + + if (addr == 0) { + VG_(printf)("%08lx", addr); + return; + } + + TG_(get_debug_info)(addr, &dir_buf, &fl_buf, &fn_buf, &ln, &di); + + if (VG_(strcmp)(fn_buf, "???") == 0) + VG_(printf)("%#lx", addr); + else + VG_(printf)("%#lx %s", addr, fn_buf); + + if (di) { obj_name = VG_(DebugInfo_get_filename)(di); if (obj_name) { - while(obj_name[i]) { - if (obj_name[i]=='/') opos = i+1; - i++; - } - if (obj_name[0]) - VG_(printf)(" %s", obj_name+opos); + while (obj_name[i]) { + if (obj_name[i] == '/') + opos = i + 1; + i++; + } + if (obj_name[0]) + VG_(printf)(" %s", obj_name + opos); } - } - - if (ln>0) { - if (dir_buf[0]) - VG_(printf)(" (%s/%s:%u)", dir_buf, fl_buf, ln); - else - 
VG_(printf)(" (%s:%u)", fl_buf, ln); - } + } + + if (ln > 0) { + if (dir_buf[0]) + VG_(printf)(" (%s/%s:%u)", dir_buf, fl_buf, ln); + else + VG_(printf)(" (%s:%u)", fl_buf, ln); + } } void TG_(print_addr_ln)(Addr addr) { - TG_(print_addr)(addr); - VG_(printf)("\n"); + TG_(print_addr)(addr); + VG_(printf)("\n"); } static ULong bb_written = 0; void TG_(print_bbno)(void) { - if (bb_written != TG_(stat).bb_executions) { - bb_written = TG_(stat).bb_executions; - VG_(printf)("BB# %llu\n",TG_(stat).bb_executions); - } + if (bb_written != TG_(stat).bb_executions) { + bb_written = TG_(stat).bb_executions; + VG_(printf)("BB# %llu\n", TG_(stat).bb_executions); + } } void TG_(print_context)(void) { - BBCC* bbcc; - - TG_DEBUG(0,"In tid %u [%d] ", - TG_(current_tid), TG_(current_call_stack).sp); - bbcc = TG_(current_state).bbcc; - print_mangled_cxt(TG_(current_state).cxt, - bbcc ? bbcc->rec_index : 0); - VG_(printf)("\n"); + BBCC* bbcc; + + TG_DEBUG(0, "In tid %u [%d] ", TG_(current_tid), TG_(current_call_stack).sp); + bbcc = TG_(current_state).bbcc; + print_mangled_cxt(TG_(current_state).cxt, bbcc ? bbcc->rec_index : 0); + VG_(printf)("\n"); } void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f) { - TG_DEBUG(3, "Malloc(%lu) in %s.\n", s, f); - return VG_(malloc)(cc,s); + TG_DEBUG(3, "Malloc(%lu) in %s.\n", s, f); + return VG_(malloc)(cc, s); } #else /* TG_ENABLE_DEBUG */ diff --git a/tracegrind/dump.c b/tracegrind/dump.c index 837f69b9f..dadc053c0 100644 --- a/tracegrind/dump.c +++ b/tracegrind/dump.c @@ -26,11 +26,11 @@ #include "config.h" #include "global.h" -#include "tg_msgpack.h" #include "tg_lz4.h" +#include "tg_msgpack.h" -#include "pub_tool_threadstate.h" #include "pub_tool_libcfile.h" +#include "pub_tool_threadstate.h" /* Total reads/writes/misses sum over all threads. 
*/ FullCost TG_(total_cost) = 0; @@ -41,23 +41,22 @@ EventMapping* TG_(dumpmap) = 0; /* === Trace output === */ /* ================================================================== */ -trace_output TG_(trace_out) = { .fd = -1, .seq = 0, - .initialized = False, - .header_written = False }; +trace_output TG_(trace_out) = { + .fd = -1, .seq = 0, .initialized = False, .header_written = False}; /* ================================================================== */ /* === MsgPack + LZ4 output === */ /* ================================================================== */ -#define MSGPACK_CHUNK_ROWS 4096 /* Rows per compressed chunk */ -#define MSGPACK_INITIAL_BUF (256 * 1024) /* Initial buffer size */ +#define MSGPACK_CHUNK_ROWS 4096 /* Rows per compressed chunk */ +#define MSGPACK_INITIAL_BUF (256 * 1024) /* Initial buffer size */ typedef struct { - msgpack_buffer buf; /* Buffer for serializing rows */ - UInt rows_in_chunk; /* Number of rows in current chunk */ - UInt n_event_cols; /* Number of dynamic event columns */ - const HChar** col_names; /* Column names (for header) */ - Int ncols; /* Total columns including events */ + msgpack_buffer buf; /* Buffer for serializing rows */ + UInt rows_in_chunk; /* Number of rows in current chunk */ + UInt n_event_cols; /* Number of dynamic event columns */ + const HChar** col_names; /* Column names (for header) */ + Int ncols; /* Total columns including events */ } msgpack_state; static msgpack_state mp_state; @@ -65,369 +64,390 @@ static msgpack_state mp_state; /* Write a compressed chunk to the trace output */ static void msgpack_flush_chunk(void) { - if (mp_state.rows_in_chunk == 0) return; - if (TG_(trace_out).fd < 0) return; - - /* Compress the msgpack data with zstd */ - SizeT src_size = mp_state.buf.size; - SizeT dst_capacity = tg_lz4_compress_bound(src_size); - UChar* compressed = VG_(malloc)("tg.mp.compress", dst_capacity); - - SizeT compressed_size = tg_lz4_compress( - compressed, dst_capacity, - mp_state.buf.data, src_size); - - if (compressed_size == 0) { - /* Compression failed, write raw with size=0 marker */ - VG_(free)(compressed); - return; - } - - /* Write chunk header: 4 bytes uncompressed size, 4 bytes compressed size */ - UChar hdr[8]; - hdr[0] = (UChar)(src_size & 0xff); - hdr[1] = (UChar)((src_size >> 8) & 0xff); - hdr[2] = (UChar)((src_size >> 16) & 0xff); - hdr[3] = (UChar)((src_size >> 24) & 0xff); - hdr[4] = (UChar)(compressed_size & 0xff); - hdr[5] = (UChar)((compressed_size >> 8) & 0xff); - hdr[6] = (UChar)((compressed_size >> 16) & 0xff); - hdr[7] = (UChar)((compressed_size >> 24) & 0xff); - VG_(write)(TG_(trace_out).fd, hdr, 8); - - /* Write compressed data */ - VG_(write)(TG_(trace_out).fd, compressed, compressed_size); - - VG_(free)(compressed); - - /* Reset buffer for next chunk */ - msgpack_reset(&mp_state.buf); - mp_state.rows_in_chunk = 0; + if (mp_state.rows_in_chunk == 0) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Compress the msgpack data with zstd */ + SizeT src_size = mp_state.buf.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.compress", dst_capacity); + + SizeT compressed_size = + tg_lz4_compress(compressed, dst_capacity, mp_state.buf.data, src_size); + + if (compressed_size == 0) { + /* Compression failed, write raw with size=0 marker */ + VG_(free)(compressed); + return; + } + + /* Write chunk header: 4 bytes uncompressed size, 4 bytes compressed size */ + UChar hdr[8]; + hdr[0] = (UChar)(src_size & 0xff); + hdr[1] = (UChar)((src_size >> 
8) & 0xff); + hdr[2] = (UChar)((src_size >> 16) & 0xff); + hdr[3] = (UChar)((src_size >> 24) & 0xff); + hdr[4] = (UChar)(compressed_size & 0xff); + hdr[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr, 8); + + /* Write compressed data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + + /* Reset buffer for next chunk */ + msgpack_reset(&mp_state.buf); + mp_state.rows_in_chunk = 0; } /* Write file header with schema metadata (discriminated union format) */ static void msgpack_write_header(void) { - msgpack_buffer hdr; - msgpack_init(&hdr, 2048); - - /* Header is a map with metadata */ - msgpack_write_map_header(&hdr, 7); - - /* version */ - msgpack_write_key(&hdr, "version"); - msgpack_write_uint(&hdr, 4); - - /* format */ - msgpack_write_key(&hdr, "format"); - msgpack_write_str(&hdr, "tracegrind-msgpack", -1); - - /* creator */ - msgpack_write_key(&hdr, "creator"); - msgpack_write_str(&hdr, "valgrind-tracegrind", -1); - - /* creator_version */ - msgpack_write_key(&hdr, "creator_version"); - msgpack_write_str(&hdr, VERSION, -1); - - /* event_schemas - discriminated union: each event type has its own schema */ - msgpack_write_key(&hdr, "event_schemas"); - msgpack_write_map_header(&hdr, 7); /* 7 event types */ - - /* Event type 0 (MARKER) schema */ - msgpack_write_key(&hdr, "0"); - msgpack_write_array_header(&hdr, 4); - msgpack_write_str(&hdr, "seq", -1); - msgpack_write_str(&hdr, "tid", -1); - msgpack_write_str(&hdr, "event", -1); - msgpack_write_str(&hdr, "marker", -1); - - /* Event types 1-4: 7 fixed columns + "counters" sentinel */ - { - const HChar* ev_keys[] = {"1", "2", "3", "4"}; - Int k; - for (k = 0; k < 4; k++) { - msgpack_write_key(&hdr, ev_keys[k]); - msgpack_write_array_header(&hdr, 8); - msgpack_write_str(&hdr, "seq", -1); - msgpack_write_str(&hdr, "tid", -1); - msgpack_write_str(&hdr, "event", -1); - msgpack_write_str(&hdr, "fn", -1); - msgpack_write_str(&hdr, "obj", -1); - msgpack_write_str(&hdr, "file", -1); - msgpack_write_str(&hdr, "line", -1); - msgpack_write_str(&hdr, "counters", -1); - } - } - - /* Event type 5 (FORK) schema */ - msgpack_write_key(&hdr, "5"); - msgpack_write_array_header(&hdr, 4); - msgpack_write_str(&hdr, "seq", -1); - msgpack_write_str(&hdr, "tid", -1); - msgpack_write_str(&hdr, "event", -1); - msgpack_write_str(&hdr, "child_pid", -1); - - /* Event type 6 (THREAD_CREATE) schema */ - msgpack_write_key(&hdr, "6"); - msgpack_write_array_header(&hdr, 4); - msgpack_write_str(&hdr, "seq", -1); - msgpack_write_str(&hdr, "tid", -1); - msgpack_write_str(&hdr, "event", -1); - msgpack_write_str(&hdr, "child_tid", -1); - - /* counters - array of dynamic counter column names */ - msgpack_write_key(&hdr, "counters"); - msgpack_write_array_header(&hdr, mp_state.n_event_cols); - { - Int i; - for (i = 7; i < mp_state.ncols; i++) { - msgpack_write_str(&hdr, mp_state.col_names[i], -1); - } - } - - /* counter_units - map from counter name to unit string. - Following callgrind's convention: only time counters get units. 
*/ - msgpack_write_key(&hdr, "counter_units"); - { - Int n_units = 0; - const HChar* unit_str = NULL; - switch (TG_(clo).collect_systime) { - case systime_no: break; - case systime_msec: unit_str = "ms"; n_units = 1; break; - case systime_usec: unit_str = "us"; n_units = 1; break; - case systime_nsec: unit_str = "ns"; n_units = 2; break; - } - msgpack_write_map_header(&hdr, n_units); - if (unit_str) { - msgpack_write_key(&hdr, "sysTime"); + msgpack_buffer hdr; + msgpack_init(&hdr, 2048); + + /* Header is a map with metadata */ + msgpack_write_map_header(&hdr, 7); + + /* version */ + msgpack_write_key(&hdr, "version"); + msgpack_write_uint(&hdr, 4); + + /* format */ + msgpack_write_key(&hdr, "format"); + msgpack_write_str(&hdr, "tracegrind-msgpack", -1); + + /* creator */ + msgpack_write_key(&hdr, "creator"); + msgpack_write_str(&hdr, "valgrind-tracegrind", -1); + + /* creator_version */ + msgpack_write_key(&hdr, "creator_version"); + msgpack_write_str(&hdr, VERSION, -1); + + /* event_schemas - discriminated union: each event type has its own schema */ + msgpack_write_key(&hdr, "event_schemas"); + msgpack_write_map_header(&hdr, 7); /* 7 event types */ + + /* Event type 0 (MARKER) schema */ + msgpack_write_key(&hdr, "0"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "marker", -1); + + /* Event types 1-4: 7 fixed columns + "counters" sentinel */ + { + const HChar* ev_keys[] = {"1", "2", "3", "4"}; + Int k; + for (k = 0; k < 4; k++) { + msgpack_write_key(&hdr, ev_keys[k]); + msgpack_write_array_header(&hdr, 8); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "fn", -1); + msgpack_write_str(&hdr, "obj", -1); + msgpack_write_str(&hdr, "file", -1); + msgpack_write_str(&hdr, "line", -1); + msgpack_write_str(&hdr, "counters", -1); + } + } + + /* Event type 5 (FORK) schema */ + msgpack_write_key(&hdr, "5"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "child_pid", -1); + + /* Event type 6 (THREAD_CREATE) schema */ + msgpack_write_key(&hdr, "6"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "child_tid", -1); + + /* counters - array of dynamic counter column names */ + msgpack_write_key(&hdr, "counters"); + msgpack_write_array_header(&hdr, mp_state.n_event_cols); + { + Int i; + for (i = 7; i < mp_state.ncols; i++) { + msgpack_write_str(&hdr, mp_state.col_names[i], -1); + } + } + + /* counter_units - map from counter name to unit string. + Following callgrind's convention: only time counters get units. 
*/ + msgpack_write_key(&hdr, "counter_units"); + { + Int n_units = 0; + const HChar* unit_str = NULL; + switch (TG_(clo).collect_systime) { + case systime_no: + break; + case systime_msec: + unit_str = "ms"; + n_units = 1; + break; + case systime_usec: + unit_str = "us"; + n_units = 1; + break; + case systime_nsec: + unit_str = "ns"; + n_units = 2; + break; + } + msgpack_write_map_header(&hdr, n_units); + if (unit_str) { + msgpack_write_key(&hdr, "sysTime"); + msgpack_write_str(&hdr, unit_str, -1); + if (TG_(clo).collect_systime == systime_nsec) { + msgpack_write_key(&hdr, "sysCpuTime"); msgpack_write_str(&hdr, unit_str, -1); - if (TG_(clo).collect_systime == systime_nsec) { - msgpack_write_key(&hdr, "sysCpuTime"); - msgpack_write_str(&hdr, unit_str, -1); - } - } - } - - /* Compress and write header chunk */ - SizeT src_size = hdr.size; - SizeT dst_capacity = tg_lz4_compress_bound(src_size); - UChar* compressed = VG_(malloc)("tg.mp.hdr", dst_capacity); - - SizeT compressed_size = tg_lz4_compress( - compressed, dst_capacity, hdr.data, src_size); - - /* Magic + version (8 bytes): "TGMP" + version(4) - version 4 */ - UChar magic[8] = {'T', 'G', 'M', 'P', 0x04, 0x00, 0x00, 0x00}; - VG_(write)(TG_(trace_out).fd, magic, 8); - - /* Header chunk size (4 bytes uncompressed, 4 bytes compressed) */ - UChar hdr_size[8]; - hdr_size[0] = (UChar)(src_size & 0xff); - hdr_size[1] = (UChar)((src_size >> 8) & 0xff); - hdr_size[2] = (UChar)((src_size >> 16) & 0xff); - hdr_size[3] = (UChar)((src_size >> 24) & 0xff); - hdr_size[4] = (UChar)(compressed_size & 0xff); - hdr_size[5] = (UChar)((compressed_size >> 8) & 0xff); - hdr_size[6] = (UChar)((compressed_size >> 16) & 0xff); - hdr_size[7] = (UChar)((compressed_size >> 24) & 0xff); - VG_(write)(TG_(trace_out).fd, hdr_size, 8); - - /* Compressed header data */ - VG_(write)(TG_(trace_out).fd, compressed, compressed_size); - - VG_(free)(compressed); - msgpack_free(&hdr); + } + } + } + + /* Compress and write header chunk */ + SizeT src_size = hdr.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.hdr", dst_capacity); + + SizeT compressed_size = + tg_lz4_compress(compressed, dst_capacity, hdr.data, src_size); + + /* Magic + version (8 bytes): "TGMP" + version(4) - version 4 */ + UChar magic[8] = {'T', 'G', 'M', 'P', 0x04, 0x00, 0x00, 0x00}; + VG_(write)(TG_(trace_out).fd, magic, 8); + + /* Header chunk size (4 bytes uncompressed, 4 bytes compressed) */ + UChar hdr_size[8]; + hdr_size[0] = (UChar)(src_size & 0xff); + hdr_size[1] = (UChar)((src_size >> 8) & 0xff); + hdr_size[2] = (UChar)((src_size >> 16) & 0xff); + hdr_size[3] = (UChar)((src_size >> 24) & 0xff); + hdr_size[4] = (UChar)(compressed_size & 0xff); + hdr_size[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr_size[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr_size[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr_size, 8); + + /* Compressed header data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + msgpack_free(&hdr); } /* Initialize msgpack state with schema from event sets */ static void msgpack_init_state(void) { - EventSet* es = TG_(sets).full; - Int g, i; - - /* Count dynamic event columns */ - Int n_events = 0; - for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { - if (!(es->mask & (1u << g))) continue; - EventGroup* eg = TG_(get_event_group)(g); - if (!eg) continue; - n_events += eg->size; - } - - mp_state.n_event_cols = n_events; - mp_state.ncols = 7 + n_events; /* 7 
fixed + dynamic */ - - /* Allocate column names array */ - mp_state.col_names = VG_(malloc)("tg.mp.cols", - mp_state.ncols * sizeof(HChar*)); - - /* Fixed columns */ - mp_state.col_names[0] = "seq"; - mp_state.col_names[1] = "tid"; - mp_state.col_names[2] = "event"; - mp_state.col_names[3] = "fn"; - mp_state.col_names[4] = "obj"; - mp_state.col_names[5] = "file"; - mp_state.col_names[6] = "line"; - - /* Dynamic event columns */ - Int c = 7; - for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { - if (!(es->mask & (1u << g))) continue; - EventGroup* eg = TG_(get_event_group)(g); - if (!eg) continue; - for (i = 0; i < eg->size; i++) { - mp_state.col_names[c++] = eg->name[i]; - } - } - - /* Initialize buffer */ - msgpack_init(&mp_state.buf, MSGPACK_INITIAL_BUF); - mp_state.rows_in_chunk = 0; - - /* Write file header */ - msgpack_write_header(); + EventSet* es = TG_(sets).full; + Int g, i; + + /* Count dynamic event columns */ + Int n_events = 0; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) + continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) + continue; + n_events += eg->size; + } + + mp_state.n_event_cols = n_events; + mp_state.ncols = 7 + n_events; /* 7 fixed + dynamic */ + + /* Allocate column names array */ + mp_state.col_names = + VG_(malloc)("tg.mp.cols", mp_state.ncols * sizeof(HChar*)); + + /* Fixed columns */ + mp_state.col_names[0] = "seq"; + mp_state.col_names[1] = "tid"; + mp_state.col_names[2] = "event"; + mp_state.col_names[3] = "fn"; + mp_state.col_names[4] = "obj"; + mp_state.col_names[5] = "file"; + mp_state.col_names[6] = "line"; + + /* Dynamic event columns */ + Int c = 7; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) + continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) + continue; + for (i = 0; i < eg->size; i++) { + mp_state.col_names[c++] = eg->name[i]; + } + } + + /* Initialize buffer */ + msgpack_init(&mp_state.buf, MSGPACK_INITIAL_BUF); + mp_state.rows_in_chunk = 0; + + /* Write file header */ + msgpack_write_header(); } /* Add an ENTER/EXIT row to the msgpack output */ -static void msgpack_add_row(ULong seq, Int tid, Int event, - const HChar* fn_name, Int fn_len, - const HChar* obj_name, Int obj_len, - const HChar* file_name, Int file_len, - Int line, - const ULong* deltas, Int n_deltas) +static void msgpack_add_row(ULong seq, + Int tid, + Int event, + const HChar* fn_name, + Int fn_len, + const HChar* obj_name, + Int obj_len, + const HChar* file_name, + Int file_len, + Int line, + const ULong* deltas, + Int n_deltas) { - /* Each row is a msgpack array: 7 fixed + 1 counters sub-array */ - msgpack_write_array_header(&mp_state.buf, 8); - - /* Fixed columns */ - msgpack_write_uint(&mp_state.buf, seq); - msgpack_write_int(&mp_state.buf, tid); - msgpack_write_int(&mp_state.buf, event); - msgpack_write_str(&mp_state.buf, fn_name, fn_len); - msgpack_write_str(&mp_state.buf, obj_name, obj_len); - msgpack_write_str(&mp_state.buf, file_name, file_len); - msgpack_write_int(&mp_state.buf, line); - - /* Counters sub-array */ - msgpack_write_array_header(&mp_state.buf, n_deltas); - for (Int i = 0; i < n_deltas; i++) { - msgpack_write_uint(&mp_state.buf, deltas[i]); - } - - mp_state.rows_in_chunk++; - - /* Flush if chunk is full */ - if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { - msgpack_flush_chunk(); - } + /* Each row is a msgpack array: 7 fixed + 1 counters sub-array */ + msgpack_write_array_header(&mp_state.buf, 8); + + /* Fixed columns */ + msgpack_write_uint(&mp_state.buf, seq); + 
msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, event); + msgpack_write_str(&mp_state.buf, fn_name, fn_len); + msgpack_write_str(&mp_state.buf, obj_name, obj_len); + msgpack_write_str(&mp_state.buf, file_name, file_len); + msgpack_write_int(&mp_state.buf, line); + + /* Counters sub-array */ + msgpack_write_array_header(&mp_state.buf, n_deltas); + for (Int i = 0; i < n_deltas; i++) { + msgpack_write_uint(&mp_state.buf, deltas[i]); + } + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } } -/* Add a FORK row to the msgpack output (minimal schema: seq, tid, event, child_pid) */ +/* Add a FORK row to the msgpack output (minimal schema: seq, tid, event, + * child_pid) */ static void msgpack_add_fork_row(ULong seq, Int tid, Int child_pid) { - /* FORK row is a 4-element array */ - msgpack_write_array_header(&mp_state.buf, 4); - msgpack_write_uint(&mp_state.buf, seq); - msgpack_write_int(&mp_state.buf, tid); - msgpack_write_int(&mp_state.buf, TG_EV_FORK); - msgpack_write_int(&mp_state.buf, child_pid); - - mp_state.rows_in_chunk++; - - /* Flush if chunk is full */ - if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { - msgpack_flush_chunk(); - } + /* FORK row is a 4-element array */ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_FORK); + msgpack_write_int(&mp_state.buf, child_pid); + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } } /* Add a THREAD_CREATE row to the msgpack output (seq, tid, event, child_tid) */ static void msgpack_add_thread_create_row(ULong seq, Int tid, Int child_tid) { - msgpack_write_array_header(&mp_state.buf, 4); - msgpack_write_uint(&mp_state.buf, seq); - msgpack_write_int(&mp_state.buf, tid); - msgpack_write_int(&mp_state.buf, TG_EV_THREAD_CREATE); - msgpack_write_int(&mp_state.buf, child_tid); + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_THREAD_CREATE); + msgpack_write_int(&mp_state.buf, child_tid); - mp_state.rows_in_chunk++; + mp_state.rows_in_chunk++; - if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { - msgpack_flush_chunk(); - } + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } } /* Add a MARKER row to the msgpack output (seq, tid, event, marker_str) */ static void msgpack_add_marker_row(ULong seq, Int tid, const HChar* marker) { - msgpack_write_array_header(&mp_state.buf, 4); - msgpack_write_uint(&mp_state.buf, seq); - msgpack_write_int(&mp_state.buf, tid); - msgpack_write_int(&mp_state.buf, TG_EV_MARKER); - msgpack_write_str(&mp_state.buf, marker, -1); + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_MARKER); + msgpack_write_str(&mp_state.buf, marker, -1); - mp_state.rows_in_chunk++; + mp_state.rows_in_chunk++; - if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { - msgpack_flush_chunk(); - } + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } } /* Close msgpack output */ static void msgpack_close_output(void) { - /* Flush any remaining rows */ - msgpack_flush_chunk(); - - /* Write end marker (zero-size chunk) */ - UChar 
end[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - VG_(write)(TG_(trace_out).fd, end, 8); - - /* Cleanup */ - msgpack_free(&mp_state.buf); - if (mp_state.col_names) { - VG_(free)(mp_state.col_names); - mp_state.col_names = NULL; - } + /* Flush any remaining rows */ + msgpack_flush_chunk(); + + /* Write end marker (zero-size chunk) */ + UChar end[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + VG_(write)(TG_(trace_out).fd, end, 8); + + /* Cleanup */ + msgpack_free(&mp_state.buf); + if (mp_state.col_names) { + VG_(free)(mp_state.col_names); + mp_state.col_names = NULL; + } } - void TG_(trace_open_output)(void) { - SysRes res; - HChar filename[512]; - - if (TG_(trace_out).initialized) return; - - if (!TG_(clo).out_format) - TG_(clo).out_format = DEFAULT_OUTFORMAT; - - HChar* expanded = VG_(expand_file_name)("--tracegrind-out-file", - TG_(clo).out_format); - VG_(strncpy)(filename, expanded, sizeof(filename) - 1); - filename[sizeof(filename) - 1] = '\0'; - VG_(free)(expanded); - - res = VG_(open)(filename, - VKI_O_CREAT|VKI_O_WRONLY|VKI_O_TRUNC, - VKI_S_IRUSR|VKI_S_IWUSR); - if (sr_isError(res)) { - VG_(message)(Vg_UserMsg, - "Error: cannot open trace output file '%s'\n", filename); - VG_(exit)(1); - } - - TG_(trace_out).fd = (Int)sr_Res(res); - TG_(trace_out).seq = 0; - TG_(trace_out).initialized = True; - TG_(trace_out).header_written = False; - - /* Initialize msgpack writer */ - msgpack_init_state(); - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); + SysRes res; + HChar filename[512]; + + if (TG_(trace_out).initialized) + return; + + if (!TG_(clo).out_format) + TG_(clo).out_format = DEFAULT_OUTFORMAT; + + HChar* expanded = + VG_(expand_file_name)("--tracegrind-out-file", TG_(clo).out_format); + VG_(strncpy)(filename, expanded, sizeof(filename) - 1); + filename[sizeof(filename) - 1] = '\0'; + VG_(free)(expanded); + + res = VG_(open)(filename, VKI_O_CREAT | VKI_O_WRONLY | VKI_O_TRUNC, + VKI_S_IRUSR | VKI_S_IWUSR); + if (sr_isError(res)) { + VG_(message)(Vg_UserMsg, "Error: cannot open trace output file '%s'\n", + filename); + VG_(exit)(1); + } + + TG_(trace_out).fd = (Int)sr_Res(res); + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = True; + TG_(trace_out).header_written = False; + + /* Initialize msgpack writer */ + msgpack_init_state(); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); } /* @@ -437,212 +457,226 @@ void TG_(trace_open_output)(void) */ void TG_(trace_reopen_child)(void) { - /* Close inherited fd without flushing/finalizing (that's parent's job) */ - if (TG_(trace_out).fd >= 0) { - VG_(close)(TG_(trace_out).fd); - } - - /* Reset state completely */ - TG_(trace_out).fd = -1; - TG_(trace_out).seq = 0; - TG_(trace_out).initialized = False; - TG_(trace_out).header_written = False; - - /* Open new trace file with child's PID (also re-inits msgpack state) */ - TG_(trace_open_output)(); + /* Close inherited fd without flushing/finalizing (that's parent's job) */ + if (TG_(trace_out).fd >= 0) { + VG_(close)(TG_(trace_out).fd); + } + + /* Reset state completely */ + TG_(trace_out).fd = -1; + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = False; + TG_(trace_out).header_written = False; + + /* Open new trace file with child's PID (also re-inits msgpack state) */ + TG_(trace_open_output)(); } -void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, - fn_node* fn) +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn) { - Int i; - - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd 
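
The FORK, THREAD_CREATE and MARKER writers above use the minimal 4-element row shape [seq, tid, event, payload] declared in the header's event_schemas map. The exact byte layout comes from tg_msgpack.c, which is not shown in this hunk; assuming msgpack_write_uint/msgpack_write_int use the canonical encodings of the public MessagePack spec for non-negative values, a FORK row would serialize as in this toy encoder (all names and values below are illustrative only):

#include <stdio.h>
#include <stdint.h>

/* Canonical MessagePack encodings for small unsigned values only
   (positive fixint, uint8, uint16, uint32); enough for this demo. */
static int put_uint(unsigned char* p, uint64_t v)
{
   if (v < 128)     { p[0] = (unsigned char)v; return 1; }
   if (v <= 0xff)   { p[0] = 0xcc; p[1] = (unsigned char)v; return 2; }
   if (v <= 0xffff) { p[0] = 0xcd; p[1] = (unsigned char)(v >> 8);
                      p[2] = (unsigned char)v; return 3; }
   p[0] = 0xce;
   p[1] = (unsigned char)(v >> 24); p[2] = (unsigned char)(v >> 16);
   p[3] = (unsigned char)(v >> 8);  p[4] = (unsigned char)v;
   return 5;
}

int main(void)
{
   unsigned char buf[32];
   int n = 0;
   buf[n++] = 0x94;               /* fixarray, 4 elements            */
   n += put_uint(buf + n, 42);    /* seq                             */
   n += put_uint(buf + n, 1);     /* tid                             */
   n += put_uint(buf + n, 5);     /* event: FORK (schema key "5")    */
   n += put_uint(buf + n, 31337); /* child_pid                       */
   for (int i = 0; i < n; i++) printf("%02x ", buf[i]);
   printf("\n");                  /* prints: 94 2a 01 05 cd 7a 69    */
   return 0;
}
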
< 0) return; - - /* Get current thread info for per-thread last_sample_cost */ - thread_info* ti = TG_(get_current_thread)(); - if (!ti) return; - - EventSet* es = TG_(sets).full; - FullCost current_cost = TG_(current_state).cost; - - /* If last_sample_cost not yet allocated, allocate and zero it */ - if (!ti->last_sample_cost) { - ti->last_sample_cost = TG_(get_eventset_cost)(es); - TG_(init_cost)(es, ti->last_sample_cost); - } - - TG_(trace_out).seq++; - - /* Resolve function info with cached lengths */ - const HChar* fn_name; - Int fn_len; - const HChar* obj_name; - Int obj_len; - const HChar* file_name; - Int file_len; - - if (fn) { - fn_name = fn->name; - fn_len = (Int)fn->name_len; - if (fn->file) { - file_name = fn->file->name; - file_len = (Int)fn->file->name_len; - if (fn->file->obj) { - obj_name = fn->file->obj->name; - obj_len = (Int)fn->file->obj->name_len; - } else { - obj_name = "???"; obj_len = 3; - } - } else { - file_name = "???"; file_len = 3; - obj_name = "???"; obj_len = 3; - } - } else { - fn_name = "???"; fn_len = 3; - obj_name = "???"; obj_len = 3; - file_name = "???"; file_len = 3; - } - - /* Compute deltas for all event counters */ - ULong deltas[64]; /* es->size is always small */ - tl_assert(es->size <= 64); - if (current_cost && ti->last_sample_cost) { - for (i = 0; i < es->size; i++) { - deltas[i] = current_cost[i] - ti->last_sample_cost[i]; - } - TG_(copy_cost)(es, ti->last_sample_cost, current_cost); - } else { - for (i = 0; i < es->size; i++) { - deltas[i] = 0; - } - } - - Int event_val = is_enter ? TG_EV_ENTER_FN : TG_EV_EXIT_FN; - - msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, - fn_name, fn_len, obj_name, obj_len, - file_name, file_len, 0, - deltas, es->size); + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Get current thread info for per-thread last_sample_cost */ + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + /* If last_sample_cost not yet allocated, allocate and zero it */ + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + /* Resolve function info with cached lengths */ + const HChar* fn_name; + Int fn_len; + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (fn) { + fn_name = fn->name; + fn_len = (Int)fn->name_len; + if (fn->file) { + file_name = fn->file->name; + file_len = (Int)fn->file->name_len; + if (fn->file->obj) { + obj_name = fn->file->obj->name; + obj_len = (Int)fn->file->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + } else { + file_name = "???"; + file_len = 3; + obj_name = "???"; + obj_len = 3; + } + } else { + fn_name = "???"; + fn_len = 3; + obj_name = "???"; + obj_len = 3; + file_name = "???"; + file_len = 3; + } + + /* Compute deltas for all event counters */ + ULong deltas[64]; /* es->size is always small */ + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + Int event_val = is_enter ? 
TG_EV_ENTER_FN : TG_EV_EXIT_FN; + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, fn_name, fn_len, + obj_name, obj_len, file_name, file_len, 0, deltas, es->size); } void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) { - Int i; - - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd < 0) return; - - thread_info* ti = TG_(get_current_thread)(); - if (!ti) return; - - EventSet* es = TG_(sets).full; - FullCost current_cost = TG_(current_state).cost; - - if (!ti->last_sample_cost) { - ti->last_sample_cost = TG_(get_eventset_cost)(es); - TG_(init_cost)(es, ti->last_sample_cost); - } - - TG_(trace_out).seq++; - - const HChar* fn_name = inl_fn; - Int fn_len = -1; /* inlined fn names not cached, use strlen */ - const HChar* obj_name; - Int obj_len; - const HChar* file_name; - Int file_len; - - if (bb->obj) { - obj_name = bb->obj->name; - obj_len = (Int)bb->obj->name_len; - } else { - obj_name = "???"; obj_len = 3; - } - if (bb->fn && bb->fn->file) { - file_name = bb->fn->file->name; - file_len = (Int)bb->fn->file->name_len; - } else { - file_name = "???"; file_len = 3; - } - - ULong deltas[64]; - tl_assert(es->size <= 64); - if (current_cost && ti->last_sample_cost) { - for (i = 0; i < es->size; i++) { - deltas[i] = current_cost[i] - ti->last_sample_cost[i]; - } - TG_(copy_cost)(es, ti->last_sample_cost, current_cost); - } else { - for (i = 0; i < es->size; i++) { - deltas[i] = 0; - } - } - - msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED_FN, - fn_name, fn_len, obj_name, obj_len, - file_name, file_len, (Int)bb->line, - deltas, es->size); + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; + file_len = 3; + } + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED_FN, + fn_name, fn_len, obj_name, obj_len, file_name, file_len, + (Int)bb->line, deltas, es->size); } void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) { - Int i; - - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd < 0) return; - - thread_info* ti = TG_(get_current_thread)(); - if (!ti) return; - - EventSet* es = TG_(sets).full; - FullCost current_cost = TG_(current_state).cost; - - if (!ti->last_sample_cost) { - ti->last_sample_cost = TG_(get_eventset_cost)(es); - TG_(init_cost)(es, ti->last_sample_cost); - } - - TG_(trace_out).seq++; - - const HChar* fn_name = inl_fn; - Int fn_len = -1; /* inlined 
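
TG_(trace_emit_sample) records counter deltas rather than absolute values: each row carries how much every counter in TG_(sets).full grew since the thread's previous row, and the per-thread baseline in last_sample_cost is then advanced. The same bookkeeping, reduced to plain arrays so it can be compiled and run on its own (a sketch, not the tool's code):

#include <stdio.h>

#define N_COUNTERS 3

static unsigned long long last_sample[N_COUNTERS];   /* zero-initialised */

/* Emit the per-counter growth since the previous sample and advance
   the baseline, mirroring the last_sample_cost handling above. */
static void emit_sample(const unsigned long long current[N_COUNTERS])
{
   unsigned long long deltas[N_COUNTERS];
   for (int i = 0; i < N_COUNTERS; i++) {
      deltas[i] = current[i] - last_sample[i];
      last_sample[i] = current[i];
   }
   printf("deltas: %llu %llu %llu\n", deltas[0], deltas[1], deltas[2]);
}

int main(void)
{
   unsigned long long c1[N_COUNTERS] = {100, 10, 5};
   unsigned long long c2[N_COUNTERS] = {250, 30, 5};
   emit_sample(c1);   /* deltas: 100 10 5 */
   emit_sample(c2);   /* deltas: 150 20 0 */
   return 0;
}
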
fn names not cached, use strlen */ - const HChar* obj_name; - Int obj_len; - const HChar* file_name; - Int file_len; - - if (bb->obj) { - obj_name = bb->obj->name; - obj_len = (Int)bb->obj->name_len; - } else { - obj_name = "???"; obj_len = 3; - } - if (bb->fn && bb->fn->file) { - file_name = bb->fn->file->name; - file_len = (Int)bb->fn->file->name_len; - } else { - file_name = "???"; file_len = 3; - } - - ULong deltas[64]; - tl_assert(es->size <= 64); - if (current_cost && ti->last_sample_cost) { - for (i = 0; i < es->size; i++) { - deltas[i] = current_cost[i] - ti->last_sample_cost[i]; - } - TG_(copy_cost)(es, ti->last_sample_cost, current_cost); - } else { - for (i = 0; i < es->size; i++) { - deltas[i] = 0; - } - } - - msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED_FN, - fn_name, fn_len, obj_name, obj_len, - file_name, file_len, (Int)bb->line, - deltas, es->size); + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; + file_len = 3; + } + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED_FN, fn_name, + fn_len, obj_name, obj_len, file_name, file_len, + (Int)bb->line, deltas, es->size); } /* @@ -652,72 +686,79 @@ void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) */ void TG_(trace_emit_fork)(ThreadId tid, Int child_pid) { - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd < 0) return; + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; - TG_(trace_out).seq++; + TG_(trace_out).seq++; - /* FORK uses minimal schema: [seq, tid, event, child_pid] */ - msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); + /* FORK uses minimal schema: [seq, tid, event, child_pid] */ + msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); } void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child) { - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd < 0) return; + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; - TG_(trace_out).seq++; + TG_(trace_out).seq++; - msgpack_add_thread_create_row(TG_(trace_out).seq, (Int)tid, (Int)child); + msgpack_add_thread_create_row(TG_(trace_out).seq, (Int)tid, (Int)child); } void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker) { - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd < 0) return; + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 
0) + return; - TG_(trace_out).seq++; + TG_(trace_out).seq++; - msgpack_add_marker_row(TG_(trace_out).seq, (Int)tid, marker); + msgpack_add_marker_row(TG_(trace_out).seq, (Int)tid, marker); } void TG_(trace_close_output)(void) { - if (!TG_(trace_out).initialized) return; - if (TG_(trace_out).fd < 0) return; + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; - /* Flush remaining rows, write end marker */ - msgpack_close_output(); - VG_(close)(TG_(trace_out).fd); + /* Flush remaining rows, write end marker */ + msgpack_close_output(); + VG_(close)(TG_(trace_out).fd); - TG_(trace_out).fd = -1; - TG_(trace_out).initialized = False; + TG_(trace_out).fd = -1; + TG_(trace_out).initialized = False; - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, - "Trace output closed (%llu samples written)\n", - TG_(trace_out).seq); + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Trace output closed (%llu samples written)\n", + TG_(trace_out).seq); } - /* Sum costs from all threads into total_cost */ void TG_(compute_total_cost)(void) { if (!TG_(total_cost)) { - TG_(total_cost) = TG_(get_eventset_cost)(TG_(sets).full); - TG_(init_cost)(TG_(sets).full, TG_(total_cost)); + TG_(total_cost) = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, TG_(total_cost)); } { - Int t; - thread_info** thr = TG_(get_threads)(); - for (t = 1; t < VG_N_THREADS; t++) { - if (!thr[t]) continue; - TG_(add_diff_cost)(TG_(sets).full, TG_(total_cost), - thr[t]->lastdump_cost, - thr[t]->states.entry[0]->cost); - TG_(copy_cost)(TG_(sets).full, thr[t]->lastdump_cost, - thr[t]->states.entry[0]->cost); - } + Int t; + thread_info** thr = TG_(get_threads)(); + for (t = 1; t < VG_N_THREADS; t++) { + if (!thr[t]) + continue; + TG_(add_diff_cost) + (TG_(sets).full, TG_(total_cost), thr[t]->lastdump_cost, + thr[t]->states.entry[0]->cost); + TG_(copy_cost) + (TG_(sets).full, thr[t]->lastdump_cost, thr[t]->states.entry[0]->cost); + } } } diff --git a/tracegrind/events.c b/tracegrind/events.c index 3a7820aff..4e91967b8 100644 --- a/tracegrind/events.c +++ b/tracegrind/events.c @@ -30,476 +30,495 @@ #define MAX_EVENTSET_COUNT 1024 static EventGroup* eventGroup[MAX_EVENTGROUP_COUNT]; -static EventSet* eventSetTable[MAX_EVENTSET_COUNT]; -static Bool eventSets_initialized = 0; +static EventSet* eventSetTable[MAX_EVENTSET_COUNT]; +static Bool eventSets_initialized = 0; -static -void initialize_event_sets(void) +static void initialize_event_sets(void) { - Int i; + Int i; - if (eventSets_initialized) return; + if (eventSets_initialized) + return; - for(i=0; i< MAX_EVENTGROUP_COUNT; i++) - eventGroup[i] = 0; + for (i = 0; i < MAX_EVENTGROUP_COUNT; i++) + eventGroup[i] = 0; - for(i=0; i< MAX_EVENTSET_COUNT; i++) - eventSetTable[i] = 0; + for (i = 0; i < MAX_EVENTSET_COUNT; i++) + eventSetTable[i] = 0; - eventSets_initialized = 1; - } + eventSets_initialized = 1; +} -static -EventGroup* new_event_group(int id, int n) +static EventGroup* new_event_group(int id, int n) { - EventGroup* eg; + EventGroup* eg; - initialize_event_sets(); + initialize_event_sets(); - TG_ASSERT(id>=0 && id= 0 && id < MAX_EVENTGROUP_COUNT); + TG_ASSERT(eventGroup[id] == 0); - eg = (EventGroup*) TG_MALLOC("cl.events.group.1", - sizeof(EventGroup) + n * sizeof(HChar*)); - eg->size = n; - eventGroup[id] = eg; - return eg; + eg = (EventGroup*)TG_MALLOC("cl.events.group.1", + sizeof(EventGroup) + n * sizeof(HChar*)); + eg->size = n; + eventGroup[id] = eg; + return eg; } -EventGroup* TG_(register_event_group) (int id, 
const HChar* n1) +EventGroup* TG_(register_event_group)(int id, const HChar* n1) { - EventGroup* eg = new_event_group(id, 1); - eg->name[0] = n1; + EventGroup* eg = new_event_group(id, 1); + eg->name[0] = n1; - return eg; + return eg; } -EventGroup* TG_(register_event_group2)(int id, const HChar* n1, - const HChar* n2) +EventGroup* TG_(register_event_group2)(int id, const HChar* n1, const HChar* n2) { - EventGroup* eg = new_event_group(id, 2); - eg->name[0] = n1; - eg->name[1] = n2; + EventGroup* eg = new_event_group(id, 2); + eg->name[0] = n1; + eg->name[1] = n2; - return eg; + return eg; } -EventGroup* TG_(register_event_group3)(int id, const HChar* n1, - const HChar* n2, const HChar* n3) +EventGroup* TG_(register_event_group3)(int id, + const HChar* n1, + const HChar* n2, + const HChar* n3) { - EventGroup* eg = new_event_group(id, 3); - eg->name[0] = n1; - eg->name[1] = n2; - eg->name[2] = n3; + EventGroup* eg = new_event_group(id, 3); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; - return eg; + return eg; } -EventGroup* TG_(register_event_group4)(int id, const HChar* n1, - const HChar* n2, const HChar* n3, - const HChar* n4) +EventGroup* TG_(register_event_group4)( + int id, const HChar* n1, const HChar* n2, const HChar* n3, const HChar* n4) { - EventGroup* eg = new_event_group(id, 4); - eg->name[0] = n1; - eg->name[1] = n2; - eg->name[2] = n3; - eg->name[3] = n4; + EventGroup* eg = new_event_group(id, 4); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + eg->name[3] = n4; - return eg; + return eg; } EventGroup* TG_(get_event_group)(int id) { - TG_ASSERT(id>=0 && id= 0 && id < MAX_EVENTGROUP_COUNT); - return eventGroup[id]; + return eventGroup[id]; } - -static -EventSet* eventset_from_mask(UInt mask) +static EventSet* eventset_from_mask(UInt mask) { - EventSet* es; - Int i, count, offset; - - if (mask >= MAX_EVENTSET_COUNT) return 0; - - initialize_event_sets(); - if (eventSetTable[mask]) return eventSetTable[mask]; - - es = (EventSet*) TG_MALLOC("cl.events.eventset.1", sizeof(EventSet)); - es->mask = mask; - - offset = 0; - count = 0; - for(i=0;ioffset[i] = offset; - if ( ((mask & (1u<size; - count++; - } - es->size = offset; - es->count = count; - - eventSetTable[mask] = es; - return es; + EventSet* es; + Int i, count, offset; + + if (mask >= MAX_EVENTSET_COUNT) + return 0; + + initialize_event_sets(); + if (eventSetTable[mask]) + return eventSetTable[mask]; + + es = (EventSet*)TG_MALLOC("cl.events.eventset.1", sizeof(EventSet)); + es->mask = mask; + + offset = 0; + count = 0; + for (i = 0; i < MAX_EVENTGROUP_COUNT; i++) { + es->offset[i] = offset; + if (((mask & (1u << i)) == 0) || (eventGroup[i] == 0)) + continue; + + offset += eventGroup[i]->size; + count++; + } + es->size = offset; + es->count = count; + + eventSetTable[mask] = es; + return es; } EventSet* TG_(get_event_set)(Int id) { - TG_ASSERT(id>=0 && id= 0 && id < MAX_EVENTGROUP_COUNT); + return eventset_from_mask(1u << id); } EventSet* TG_(get_event_set2)(Int id1, Int id2) { - TG_ASSERT(id1>=0 && id1=0 && id2= 0 && id1 < MAX_EVENTGROUP_COUNT); + TG_ASSERT(id2 >= 0 && id2 < MAX_EVENTGROUP_COUNT); + return eventset_from_mask((1u << id1) | (1u << id2)); } EventSet* TG_(add_event_group)(EventSet* es, Int id) { - TG_ASSERT(id>=0 && idmask | (1u << id)); + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + if (!es) + es = eventset_from_mask(0); + return eventset_from_mask(es->mask | (1u << id)); } EventSet* TG_(add_event_group2)(EventSet* es, Int id1, Int id2) { - TG_ASSERT(id1>=0 && id1=0 && id2mask | 
(1u << id1) | (1u << id2)); + TG_ASSERT(id1 >= 0 && id1 < MAX_EVENTGROUP_COUNT); + TG_ASSERT(id2 >= 0 && id2 < MAX_EVENTGROUP_COUNT); + if (!es) + es = eventset_from_mask(0); + return eventset_from_mask(es->mask | (1u << id1) | (1u << id2)); } EventSet* TG_(add_event_set)(EventSet* es1, EventSet* es2) { - if (!es1) es1 = eventset_from_mask(0); - if (!es2) es2 = eventset_from_mask(0); - return eventset_from_mask(es1->mask | es2->mask); + if (!es1) + es1 = eventset_from_mask(0); + if (!es2) + es2 = eventset_from_mask(0); + return eventset_from_mask(es1->mask | es2->mask); } - /* Get cost array for an event set */ ULong* TG_(get_eventset_cost)(EventSet* es) { - return TG_(get_costarray)(es->size); + return TG_(get_costarray)(es->size); } /* Set all costs of an event set to zero */ void TG_(init_cost)(EventSet* es, ULong* cost) { - Int i; + Int i; - if (!cost) return; + if (!cost) + return; - for(i=0; isize; i++) - cost[i] = 0; + for (i = 0; i < es->size; i++) + cost[i] = 0; } /* Set all costs of an event set to zero */ void TG_(init_cost_lz)(EventSet* es, ULong** cost) { - Int i; + Int i; - TG_ASSERT(cost != 0); - if (!(*cost)) - *cost = TG_(get_eventset_cost)(es); + TG_ASSERT(cost != 0); + if (!(*cost)) + *cost = TG_(get_eventset_cost)(es); - for(i=0; isize; i++) - (*cost)[i] = 0; + for (i = 0; i < es->size; i++) + (*cost)[i] = 0; } void TG_(zero_cost)(EventSet* es, ULong* cost) { - Int i; + Int i; - if (!cost) return; + if (!cost) + return; - for(i=0;isize;i++) - cost[i] = 0; + for (i = 0; i < es->size; i++) + cost[i] = 0; } - + Bool TG_(is_zero_cost)(EventSet* es, ULong* cost) { - Int i; + Int i; - if (!cost) return True; + if (!cost) + return True; - for(i=0; isize; i++) - if (cost[i] != 0) return False; + for (i = 0; i < es->size; i++) + if (cost[i] != 0) + return False; - return True; + return True; } void TG_(copy_cost)(EventSet* es, ULong* dst, ULong* src) { - Int i; - - if (!src) { - TG_(zero_cost)(es, dst); - return; - } - TG_ASSERT(dst != 0); - - for(i=0;isize;i++) - dst[i] = src[i]; + Int i; + + if (!src) { + TG_(zero_cost)(es, dst); + return; + } + TG_ASSERT(dst != 0); + + for (i = 0; i < es->size; i++) + dst[i] = src[i]; } void TG_(copy_cost_lz)(EventSet* es, ULong** pdst, ULong* src) { - Int i; - ULong* dst; - - TG_ASSERT(pdst != 0); - - if (!src) { - TG_(zero_cost)(es, *pdst); - return; - } - dst = *pdst; - if (!dst) - dst = *pdst = TG_(get_eventset_cost)(es); - - for(i=0;isize;i++) - dst[i] = src[i]; + Int i; + ULong* dst; + + TG_ASSERT(pdst != 0); + + if (!src) { + TG_(zero_cost)(es, *pdst); + return; + } + dst = *pdst; + if (!dst) + dst = *pdst = TG_(get_eventset_cost)(es); + + for (i = 0; i < es->size; i++) + dst[i] = src[i]; } void TG_(add_cost)(EventSet* es, ULong* dst, ULong* src) { - Int i; + Int i; - if (!src) return; - TG_ASSERT(dst != 0); + if (!src) + return; + TG_ASSERT(dst != 0); - for(i=0; isize; i++) - dst[i] += src[i]; + for (i = 0; i < es->size; i++) + dst[i] += src[i]; } void TG_(add_cost_lz)(EventSet* es, ULong** pdst, ULong* src) { - Int i; - ULong* dst; - - if (!src) return; - TG_ASSERT(pdst != 0); - - dst = *pdst; - if (!dst) { - dst = *pdst = TG_(get_eventset_cost)(es); - TG_(copy_cost)(es, dst, src); - return; - } - - for(i=0; isize; i++) - dst[i] += src[i]; + Int i; + ULong* dst; + + if (!src) + return; + TG_ASSERT(pdst != 0); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(copy_cost)(es, dst, src); + return; + } + + for (i = 0; i < es->size; i++) + dst[i] += src[i]; } /* Adds src to dst and zeros src. 
Returns false if nothing changed */ Bool TG_(add_and_zero_cost)(EventSet* es, ULong* dst, ULong* src) { - Int i; - Bool is_nonzero = False; - - TG_ASSERT((es != 0) && (dst != 0)); - if (!src) return False; - - for(i=0; isize; i++) { - if (src[i]==0) continue; - dst[i] += src[i]; - src[i] = 0; - is_nonzero = True; - } - - return is_nonzero; + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + if (!src) + return False; + + for (i = 0; i < es->size; i++) { + if (src[i] == 0) + continue; + dst[i] += src[i]; + src[i] = 0; + is_nonzero = True; + } + + return is_nonzero; } /* Adds src to dst and zeros src. Returns false if nothing changed */ -Bool TG_(add_and_zero_cost2)(EventSet* esDst, ULong* dst, - EventSet* esSrc, ULong* src) +Bool TG_(add_and_zero_cost2)(EventSet* esDst, + ULong* dst, + EventSet* esSrc, + ULong* src) { - Int i,j; - Bool is_nonzero = False; - UInt mask; - EventGroup *eg; - ULong *egDst, *egSrc; - - TG_ASSERT((esDst != 0) && (dst != 0) && (esSrc != 0)); - if (!src) return False; - - for(i=0, mask=1; imask & mask)==0) continue; - if (eventGroup[i] ==0) continue; - - /* if src has a subset, dst must have, too */ - TG_ASSERT((esDst->mask & mask)>0); - eg = eventGroup[i]; - egSrc = src + esSrc->offset[i]; - egDst = dst + esDst->offset[i]; - for(j=0; jsize; j++) { - if (egSrc[j]==0) continue; - egDst[j] += egSrc[j]; - egSrc[j] = 0; - is_nonzero = True; - } - } - - return is_nonzero; + Int i, j; + Bool is_nonzero = False; + UInt mask; + EventGroup* eg; + ULong * egDst, *egSrc; + + TG_ASSERT((esDst != 0) && (dst != 0) && (esSrc != 0)); + if (!src) + return False; + + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((esSrc->mask & mask) == 0) + continue; + if (eventGroup[i] == 0) + continue; + + /* if src has a subset, dst must have, too */ + TG_ASSERT((esDst->mask & mask) > 0); + eg = eventGroup[i]; + egSrc = src + esSrc->offset[i]; + egDst = dst + esDst->offset[i]; + for (j = 0; j < eg->size; j++) { + if (egSrc[j] == 0) + continue; + egDst[j] += egSrc[j]; + egSrc[j] = 0; + is_nonzero = True; + } + } + + return is_nonzero; } - - /* Adds difference of new and old to dst, and set old to new. 
* Returns false if nothing changed */ Bool TG_(add_diff_cost)(EventSet* es, ULong* dst, ULong* old, ULong* new_cost) { - Int i; - Bool is_nonzero = False; + Int i; + Bool is_nonzero = False; - TG_ASSERT((es != 0) && (dst != 0)); - TG_ASSERT(old && new_cost); + TG_ASSERT((es != 0) && (dst != 0)); + TG_ASSERT(old && new_cost); - for(i=0; isize; i++) { - if (new_cost[i] == old[i]) continue; - dst[i] += new_cost[i] - old[i]; - old[i] = new_cost[i]; - is_nonzero = True; - } + for (i = 0; i < es->size; i++) { + if (new_cost[i] == old[i]) + continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } - return is_nonzero; + return is_nonzero; } -Bool TG_(add_diff_cost_lz)(EventSet* es, ULong** pdst, ULong* old, ULong* new_cost) +Bool TG_(add_diff_cost_lz)(EventSet* es, + ULong** pdst, + ULong* old, + ULong* new_cost) { - Int i; - ULong* dst; - Bool is_nonzero = False; - - TG_ASSERT((es != 0) && (pdst != 0)); - TG_ASSERT(old && new_cost); - - dst = *pdst; - if (!dst) { - dst = *pdst = TG_(get_eventset_cost)(es); - TG_(zero_cost)(es, dst); - } - - for(i=0; isize; i++) { - if (new_cost[i] == old[i]) continue; - dst[i] += new_cost[i] - old[i]; - old[i] = new_cost[i]; - is_nonzero = True; - } - - return is_nonzero; + Int i; + ULong* dst; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (pdst != 0)); + TG_ASSERT(old && new_cost); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(zero_cost)(es, dst); + } + + for (i = 0; i < es->size; i++) { + if (new_cost[i] == old[i]) + continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; } - /* Allocate space for an event mapping */ EventMapping* TG_(get_eventmapping)(EventSet* es) { - EventMapping* em; + EventMapping* em; - TG_ASSERT(es != 0); + TG_ASSERT(es != 0); - em = (EventMapping*) TG_MALLOC("cl.events.geMapping.1", - sizeof(EventMapping) + - sizeof(struct EventMappingEntry) * - es->size); - em->capacity = es->size; - em->size = 0; - em->es = es; + em = (EventMapping*)TG_MALLOC( + "cl.events.geMapping.1", + sizeof(EventMapping) + sizeof(struct EventMappingEntry) * es->size); + em->capacity = es->size; + em->size = 0; + em->es = es; - return em; + return em; } void TG_(append_event)(EventMapping* em, const HChar* n) { - Int i, j, offset = 0; - UInt mask; - EventGroup* eg; - - TG_ASSERT(em != 0); - for(i=0, mask=1; ies->mask & mask)==0) continue; - if (eventGroup[i] ==0) continue; - - eg = eventGroup[i]; - for(j=0; jsize; j++, offset++) { - if (VG_(strcmp)(n, eg->name[j])!=0) - continue; - - TG_ASSERT(em->capacity > em->size); - em->entry[em->size].group = i; - em->entry[em->size].index = j; - em->entry[em->size].offset = offset; - em->size++; - return; - } - } + Int i, j, offset = 0; + UInt mask; + EventGroup* eg; + + TG_ASSERT(em != 0); + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((em->es->mask & mask) == 0) + continue; + if (eventGroup[i] == 0) + continue; + + eg = eventGroup[i]; + for (j = 0; j < eg->size; j++, offset++) { + if (VG_(strcmp)(n, eg->name[j]) != 0) + continue; + + TG_ASSERT(em->capacity > em->size); + em->entry[em->size].group = i; + em->entry[em->size].index = j; + em->entry[em->size].offset = offset; + em->size++; + return; + } + } } - /* Returns pointer to dynamically string. The string will be overwritten with each invocation. 
*/ -HChar *TG_(eventmapping_as_string)(const EventMapping* em) +HChar* TG_(eventmapping_as_string)(const EventMapping* em) { - Int i; - EventGroup* eg; + Int i; + EventGroup* eg; - TG_ASSERT(em != 0); + TG_ASSERT(em != 0); - XArray *xa = VG_(newXA)(VG_(malloc), "cl.events.emas", VG_(free), - sizeof(HChar)); + XArray* xa = + VG_(newXA)(VG_(malloc), "cl.events.emas", VG_(free), sizeof(HChar)); - for(i=0; i< em->size; i++) { - if (i > 0) { - VG_(xaprintf)(xa, "%c", ' '); - } - eg = eventGroup[em->entry[i].group]; - TG_ASSERT(eg != 0); - VG_(xaprintf)(xa, "%s", eg->name[em->entry[i].index]); - } - VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + for (i = 0; i < em->size; i++) { + if (i > 0) { + VG_(xaprintf)(xa, "%c", ' '); + } + eg = eventGroup[em->entry[i].group]; + TG_ASSERT(eg != 0); + VG_(xaprintf)(xa, "%s", eg->name[em->entry[i].index]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string - HChar *buf = VG_(strdup)("cl.events.emas", VG_(indexXA)(xa, 0)); - VG_(deleteXA)(xa); + HChar* buf = VG_(strdup)("cl.events.emas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); - return buf; + return buf; } /* Returns pointer to dynamically allocated string. Caller needs to VG_(free) it. */ -HChar *TG_(mappingcost_as_string)(const EventMapping* em, const ULong* c) +HChar* TG_(mappingcost_as_string)(const EventMapping* em, const ULong* c) { - Int i, skipped = 0; - - if (!c || em->size==0) return VG_(strdup)("cl.events.mcas", ""); - - XArray *xa = VG_(newXA)(VG_(malloc), "cl.events.mcas", VG_(free), - sizeof(HChar)); - - /* At least one entry */ - VG_(xaprintf)(xa, "%llu", c[em->entry[0].offset]); - - for(i=1; isize; i++) { - if (c[em->entry[i].offset] == 0) { - skipped++; - continue; - } - while(skipped>0) { - VG_(xaprintf)(xa, " 0"); - skipped--; - } - VG_(xaprintf)(xa, " %llu", c[em->entry[i].offset]); - } - VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string - - HChar *buf = VG_(strdup)("cl.events.mas", VG_(indexXA)(xa, 0)); - VG_(deleteXA)(xa); - - return buf; + Int i, skipped = 0; + + if (!c || em->size == 0) + return VG_(strdup)("cl.events.mcas", ""); + + XArray* xa = + VG_(newXA)(VG_(malloc), "cl.events.mcas", VG_(free), sizeof(HChar)); + + /* At least one entry */ + VG_(xaprintf)(xa, "%llu", c[em->entry[0].offset]); + + for (i = 1; i < em->size; i++) { + if (c[em->entry[i].offset] == 0) { + skipped++; + continue; + } + while (skipped > 0) { + VG_(xaprintf)(xa, " 0"); + skipped--; + } + VG_(xaprintf)(xa, " %llu", c[em->entry[i].offset]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar* buf = VG_(strdup)("cl.events.mas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; } diff --git a/tracegrind/events.h b/tracegrind/events.h index 3be144222..bac264c45 100644 --- a/tracegrind/events.h +++ b/tracegrind/events.h @@ -31,7 +31,7 @@ #include "pub_tool_basics.h" -#define TG_(str) VGAPPEND(vgTracegrind_,str) +#define TG_(str) VGAPPEND(vgTracegrind_, str) /* Event groups consist of one or more named event types. * Event sets are constructed from such event groups. 
@@ -45,29 +45,29 @@ typedef struct _EventGroup EventGroup; struct _EventGroup { - Int size; - const HChar* name[0]; + Int size; + const HChar* name[0]; }; /* return 0 if event group can not be registered */ -EventGroup* TG_(register_event_group) (int id, const HChar*); +EventGroup* TG_(register_event_group)(int id, const HChar*); EventGroup* TG_(register_event_group2)(int id, const HChar*, const HChar*); -EventGroup* TG_(register_event_group3)(int id, const HChar*, const HChar*, - const HChar*); -EventGroup* TG_(register_event_group4)(int id, const HChar*, const HChar*, - const HChar*, const HChar*); +EventGroup* + TG_(register_event_group3)(int id, const HChar*, const HChar*, const HChar*); +EventGroup* TG_(register_event_group4)( + int id, const HChar*, const HChar*, const HChar*, const HChar*); EventGroup* TG_(get_event_group)(int id); /* Event sets are defined by event groups they consist of. */ typedef struct _EventSet EventSet; struct _EventSet { - /* if subset with ID x is in the set, then bit x is set */ - UInt mask; - Int count; - Int size; - Int offset[MAX_EVENTGROUP_COUNT]; - }; + /* if subset with ID x is in the set, then bit x is set */ + UInt mask; + Int count; + Int size; + Int offset[MAX_EVENTGROUP_COUNT]; +}; /* Same event set is returned when requesting same event groups */ EventSet* TG_(get_event_set)(Int id); @@ -76,56 +76,56 @@ EventSet* TG_(add_event_group)(EventSet*, Int id); EventSet* TG_(add_event_group2)(EventSet*, Int id1, Int id2); EventSet* TG_(add_event_set)(EventSet*, EventSet*); - /* Operations on costs. A cost pointer of 0 means zero cost. * Functions ending in _lz allocate cost arrays only when needed */ ULong* TG_(get_eventset_cost)(EventSet*); /* Set costs of event set to 0 */ -void TG_(init_cost)(EventSet*,ULong*); +void TG_(init_cost)(EventSet*, ULong*); /* This always allocates counter and sets them to 0 */ -void TG_(init_cost_lz)(EventSet*,ULong**); +void TG_(init_cost_lz)(EventSet*, ULong**); /* Set costs of an event set to zero */ -void TG_(zero_cost)(EventSet*,ULong*); -Bool TG_(is_zero_cost)(EventSet*,ULong*); -void TG_(copy_cost)(EventSet*,ULong* dst, ULong* src); -void TG_(copy_cost_lz)(EventSet*,ULong** pdst, ULong* src); -void TG_(add_cost)(EventSet*,ULong* dst, ULong* src); -void TG_(add_cost_lz)(EventSet*,ULong** pdst, ULong* src); +void TG_(zero_cost)(EventSet*, ULong*); +Bool TG_(is_zero_cost)(EventSet*, ULong*); +void TG_(copy_cost)(EventSet*, ULong* dst, ULong* src); +void TG_(copy_cost_lz)(EventSet*, ULong** pdst, ULong* src); +void TG_(add_cost)(EventSet*, ULong* dst, ULong* src); +void TG_(add_cost_lz)(EventSet*, ULong** pdst, ULong* src); /* Adds src to dst and zeros src. Returns false if nothing changed */ -Bool TG_(add_and_zero_cost)(EventSet*,ULong* dst, ULong* src); -Bool TG_(add_and_zero_cost2)(EventSet*,ULong* dst,EventSet*,ULong* src); +Bool TG_(add_and_zero_cost)(EventSet*, ULong* dst, ULong* src); +Bool TG_(add_and_zero_cost2)(EventSet*, ULong* dst, EventSet*, ULong* src); /* Adds difference of new and old to to dst, and set old to new. * Returns false if nothing changed */ -Bool TG_(add_diff_cost)(EventSet*,ULong* dst, ULong* old, ULong* new_cost); -Bool TG_(add_diff_cost_lz)(EventSet*,ULong** pdst, ULong* old, ULong* new_cost); +Bool TG_(add_diff_cost)(EventSet*, ULong* dst, ULong* old, ULong* new_cost); +Bool + TG_(add_diff_cost_lz)(EventSet*, ULong** pdst, ULong* old, ULong* new_cost); /* EventMapping: An ordered subset of events from an event set. * This is used to print out part of an EventSet, or in another order. 
*/ struct EventMappingEntry { - Int group; - Int index; - Int offset; + Int group; + Int index; + Int offset; }; typedef struct _EventMapping EventMapping; struct _EventMapping { - EventSet* es; - Int size; - Int capacity; - struct EventMappingEntry entry[0]; + EventSet* es; + Int size; + Int capacity; + struct EventMappingEntry entry[0]; }; /* Allocate space for an event mapping */ EventMapping* TG_(get_eventmapping)(EventSet*); -void TG_(append_event)(EventMapping*, const HChar*); +void TG_(append_event)(EventMapping*, const HChar*); /* Returns event mapping as a character string. That string is dynamically allocated and it is the caller's responsibility to free it. The function never returns NULL. */ -HChar *TG_(eventmapping_as_string)(const EventMapping*); +HChar* TG_(eventmapping_as_string)(const EventMapping*); /* Returns mapping cost as a character string. That string is dynamically allocated and it is the caller's responsibility to free it. The function never returns NULL. */ -HChar *TG_(mappingcost_as_string)(const EventMapping*, const ULong*); +HChar* TG_(mappingcost_as_string)(const EventMapping*, const ULong*); #endif /* TG_EVENTS */ diff --git a/tracegrind/fn.c b/tracegrind/fn.c index 4c314b296..47702dccc 100644 --- a/tracegrind/fn.c +++ b/tracegrind/fn.c @@ -37,81 +37,84 @@ static Addr runtime_resolve_addr[MAX_RESOLVE_ADDRS]; static int runtime_resolve_length[MAX_RESOLVE_ADDRS]; // a code pattern is a list of tuples (start offset, length) -struct chunk_t { int start, len; }; -struct pattern -{ - const HChar* name; - int len; - struct chunk_t chunk[]; +struct chunk_t { + int start, len; +}; +struct pattern { + const HChar* name; + int len; + struct chunk_t chunk[]; }; /* Scan for a pattern in the code of an ELF object. * If found, return true and set runtime_resolve_{addr,length} */ -__attribute__((unused)) // Possibly; depends on the platform. -static Bool check_code(obj_node* obj, - UChar code[], struct pattern* pat) +__attribute__((unused)) // Possibly; depends on the platform. 
+static Bool +check_code(obj_node* obj, UChar code[], struct pattern* pat) { - Bool found; - Addr addr, end; - int chunk, start, len; - - /* first chunk of pattern should always start at offset 0 and - * have at least 3 bytes */ - TG_ASSERT((pat->chunk[0].start == 0) && (pat->chunk[0].len >2)); - - /* and we cannot be called more than MAX_RESOLVE_ADDRS times */ - TG_ASSERT(runtime_resolve_addrs < MAX_RESOLVE_ADDRS); - - TG_DEBUG(1, "check_code: %s, pattern %s, check %d bytes of [%x %x %x...]\n", - obj->name, pat->name, pat->chunk[0].len, code[0], code[1], code[2]); - - end = obj->start + obj->size - pat->len; - addr = obj->start; - while(addr < end) { - found = (VG_(memcmp)( (void*)addr, code, pat->chunk[0].len) == 0); - - if (found) { - chunk = 1; - while(1) { - start = pat->chunk[chunk].start; - len = pat->chunk[chunk].len; - if (len == 0) break; - - TG_ASSERT(len >2); - TG_DEBUG(1, " found chunk %d at %#lx, checking %d bytes " - "of [%x %x %x...]\n", - chunk-1, addr - obj->start, len, - code[start], code[start+1], code[start+2]); - - if (VG_(memcmp)( (void*)(addr+start), code+start, len) != 0) { - found = False; - break; - } - chunk++; - } - - if (found) { - TG_DEBUG(1, "found at offset %#lx.\n", addr - obj->start); - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Found runtime_resolve (%s): " - "%s +%#lx=%#lx, length %d\n", - pat->name, obj->name + obj->last_slash_pos, - addr - obj->start, addr, pat->len); - - runtime_resolve_addr[runtime_resolve_addrs] = addr; - runtime_resolve_length[runtime_resolve_addrs] = pat->len; - runtime_resolve_addrs++; - return True; - } - } - addr++; - } - TG_DEBUG(1, " found nothing.\n"); - return False; + Bool found; + Addr addr, end; + int chunk, start, len; + + /* first chunk of pattern should always start at offset 0 and + * have at least 3 bytes */ + TG_ASSERT((pat->chunk[0].start == 0) && (pat->chunk[0].len > 2)); + + /* and we cannot be called more than MAX_RESOLVE_ADDRS times */ + TG_ASSERT(runtime_resolve_addrs < MAX_RESOLVE_ADDRS); + + TG_DEBUG(1, "check_code: %s, pattern %s, check %d bytes of [%x %x %x...]\n", + obj->name, pat->name, pat->chunk[0].len, code[0], code[1], code[2]); + + end = obj->start + obj->size - pat->len; + addr = obj->start; + while (addr < end) { + found = (VG_(memcmp)((void*)addr, code, pat->chunk[0].len) == 0); + + if (found) { + chunk = 1; + while (1) { + start = pat->chunk[chunk].start; + len = pat->chunk[chunk].len; + if (len == 0) + break; + + TG_ASSERT(len > 2); + TG_DEBUG(1, + " found chunk %d at %#lx, checking %d bytes " + "of [%x %x %x...]\n", + chunk - 1, addr - obj->start, len, code[start], + code[start + 1], code[start + 2]); + + if (VG_(memcmp)((void*)(addr + start), code + start, len) != 0) { + found = False; + break; + } + chunk++; + } + + if (found) { + TG_DEBUG(1, "found at offset %#lx.\n", addr - obj->start); + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Found runtime_resolve (%s): " + "%s +%#lx=%#lx, length %d\n", + pat->name, obj->name + obj->last_slash_pos, + addr - obj->start, addr, pat->len); + + runtime_resolve_addr[runtime_resolve_addrs] = addr; + runtime_resolve_length[runtime_resolve_addrs] = pat->len; + runtime_resolve_addrs++; + return True; + } + } + addr++; + } + TG_DEBUG(1, " found nothing.\n"); + return False; } - /* _ld_runtime_resolve, located in ld.so, needs special handling: * The jump at end into the resolved function should not be * represented as a call (as usually done in tracegrind with jumps), @@ -127,172 +130,192 @@ static Bool check_code(obj_node* obj, 
static Bool search_runtime_resolve(obj_node* obj) { #if defined(VGP_x86_linux) - static UChar code[] = { - /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, - /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, - /*16*/ 0x5a, 0x59, 0x87, 0x04, 0x24, 0xc2, 0x08, 0x00 }; - /* Check ranges [0-11] and [16-23] ([12-15] is an absolute address) */ - static struct pattern pat = { - "x86-def", 24, {{ 0,12 }, { 16,8 }, { 24,0}} }; - - /* Pattern for glibc-2.8 on OpenSuse11.0 */ - static UChar code_28[] = { - /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, - /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, - /*16*/ 0x5a, 0x8b, 0x0c, 0x24, 0x89, 0x04, 0x24, 0x8b, - /*24*/ 0x44, 0x24, 0x04, 0xc2, 0x0c, 0x00 }; - static struct pattern pat_28 = { - "x86-glibc2.8", 30, {{ 0,12 }, { 16,14 }, { 30,0}} }; - - if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) return False; - Bool pat_p = check_code(obj, code, &pat); - Bool pat_28_p = check_code(obj, code_28, &pat_28); - if (pat_p || pat_28_p) return True; - return False; + static UChar code[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x59, 0x87, 0x04, 0x24, 0xc2, 0x08, 0x00}; + /* Check ranges [0-11] and [16-23] ([12-15] is an absolute address) */ + static struct pattern pat = {"x86-def", 24, {{0, 12}, {16, 8}, {24, 0}}}; + + /* Pattern for glibc-2.8 on OpenSuse11.0 */ + static UChar code_28[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x8b, 0x0c, 0x24, 0x89, 0x04, 0x24, 0x8b, + /*24*/ 0x44, 0x24, 0x04, 0xc2, 0x0c, 0x00}; + static struct pattern pat_28 = { + "x86-glibc2.8", 30, {{0, 12}, {16, 14}, {30, 0}}}; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) + return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_28_p = check_code(obj, code_28, &pat_28); + if (pat_p || pat_28_p) + return True; + return False; #endif #if defined(VGP_ppc32_linux) - static UChar code[] = { - /* 0*/ 0x94, 0x21, 0xff, 0xc0, 0x90, 0x01, 0x00, 0x0c, - /* 8*/ 0x90, 0x61, 0x00, 0x10, 0x90, 0x81, 0x00, 0x14, - /*16*/ 0x7d, 0x83, 0x63, 0x78, 0x90, 0xa1, 0x00, 0x18, - /*24*/ 0x7d, 0x64, 0x5b, 0x78, 0x90, 0xc1, 0x00, 0x1c, - /*32*/ 0x7c, 0x08, 0x02, 0xa6, 0x90, 0xe1, 0x00, 0x20, - /*40*/ 0x90, 0x01, 0x00, 0x30, 0x91, 0x01, 0x00, 0x24, - /*48*/ 0x7c, 0x00, 0x00, 0x26, 0x91, 0x21, 0x00, 0x28, - /*56*/ 0x91, 0x41, 0x00, 0x2c, 0x90, 0x01, 0x00, 0x08, - /*64*/ 0x48, 0x00, 0x02, 0x91, 0x7c, 0x69, 0x03, 0xa6, /* at 64: bl aff0 */ - /*72*/ 0x80, 0x01, 0x00, 0x30, 0x81, 0x41, 0x00, 0x2c, - /*80*/ 0x81, 0x21, 0x00, 0x28, 0x7c, 0x08, 0x03, 0xa6, - /*88*/ 0x81, 0x01, 0x00, 0x24, 0x80, 0x01, 0x00, 0x08, - /*96*/ 0x80, 0xe1, 0x00, 0x20, 0x80, 0xc1, 0x00, 0x1c, - /*104*/0x7c, 0x0f, 0xf1, 0x20, 0x80, 0xa1, 0x00, 0x18, - /*112*/0x80, 0x81, 0x00, 0x14, 0x80, 0x61, 0x00, 0x10, - /*120*/0x80, 0x01, 0x00, 0x0c, 0x38, 0x21, 0x00, 0x40, - /*128*/0x4e, 0x80, 0x04, 0x20 }; - static struct pattern pat = { - "ppc32-def", 132, {{ 0,65 }, { 68,64 }, { 132,0 }} }; - - if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) return False; - return check_code(obj, code, &pat); + static UChar code[] = {/* 0*/ 0x94, 0x21, 0xff, 0xc0, 0x90, + 0x01, 0x00, 0x0c, + /* 8*/ 0x90, 0x61, 0x00, 0x10, 0x90, + 0x81, 0x00, 0x14, + /*16*/ 0x7d, 0x83, 0x63, 0x78, 0x90, + 0xa1, 0x00, 0x18, + /*24*/ 0x7d, 0x64, 0x5b, 0x78, 0x90, + 0xc1, 0x00, 0x1c, + /*32*/ 0x7c, 0x08, 0x02, 0xa6, 0x90, + 0xe1, 0x00, 0x20, + /*40*/ 0x90, 0x01, 0x00, 
0x30, 0x91, + 0x01, 0x00, 0x24, + /*48*/ 0x7c, 0x00, 0x00, 0x26, 0x91, + 0x21, 0x00, 0x28, + /*56*/ 0x91, 0x41, 0x00, 0x2c, 0x90, + 0x01, 0x00, 0x08, + /*64*/ 0x48, 0x00, 0x02, 0x91, 0x7c, + 0x69, 0x03, 0xa6, /* at 64: bl aff0 */ + /*72*/ 0x80, 0x01, 0x00, 0x30, 0x81, + 0x41, 0x00, 0x2c, + /*80*/ 0x81, 0x21, 0x00, 0x28, 0x7c, + 0x08, 0x03, 0xa6, + /*88*/ 0x81, 0x01, 0x00, 0x24, 0x80, + 0x01, 0x00, 0x08, + /*96*/ 0x80, 0xe1, 0x00, 0x20, 0x80, + 0xc1, 0x00, 0x1c, + /*104*/ 0x7c, 0x0f, 0xf1, 0x20, 0x80, + 0xa1, 0x00, 0x18, + /*112*/ 0x80, 0x81, 0x00, 0x14, 0x80, + 0x61, 0x00, 0x10, + /*120*/ 0x80, 0x01, 0x00, 0x0c, 0x38, + 0x21, 0x00, 0x40, + /*128*/ 0x4e, 0x80, 0x04, 0x20}; + static struct pattern pat = { + "ppc32-def", 132, {{0, 65}, {68, 64}, {132, 0}}}; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) + return False; + return check_code(obj, code, &pat); #endif #if defined(VGP_amd64_linux) - static UChar code[] = { - /* 0*/ 0x48, 0x83, 0xec, 0x38, 0x48, 0x89, 0x04, 0x24, - /* 8*/ 0x48, 0x89, 0x4c, 0x24, 0x08, 0x48, 0x89, 0x54, 0x24, 0x10, - /*18*/ 0x48, 0x89, 0x74, 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, - /*28*/ 0x4c, 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, 0x30, - /*38*/ 0x48, 0x8b, 0x74, 0x24, 0x40, 0x49, 0x89, 0xf3, - /*46*/ 0x4c, 0x01, 0xde, 0x4c, 0x01, 0xde, 0x48, 0xc1, 0xe6, 0x03, - /*56*/ 0x48, 0x8b, 0x7c, 0x24, 0x38, 0xe8, 0xee, 0x01, 0x00, 0x00, - /*66*/ 0x49, 0x89, 0xc3, 0x4c, 0x8b, 0x4c, 0x24, 0x30, - /*74*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, - /*84*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, 0x24, 0x10, - /*94*/ 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, 0x8b, 0x04, 0x24, - /*103*/0x48, 0x83, 0xc4, 0x48, 0x41, 0xff, 0xe3 }; - static struct pattern pat = { - "amd64-def", 110, {{ 0,62 }, { 66,44 }, { 110,0 }} }; - - static UChar code_xsavec[] = { - /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, - /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp */ - /*15*/ 0x48, - /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, - /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, - /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, - /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, - /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, - /*56*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, - /*64*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, - /*72*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, - /*80*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, - /*88*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, - /*96*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, - /*04*/ 0x0f, 0xc7, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, - /*112*/0x10, 0x48, 0x8b, 0x7b, 0x08, - /*117*/0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ - /*122*/0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, - /*128*/0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, - /*136*/0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, - /*144*/0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, - /*152*/0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, - /*160*/0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, - /*168*/0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, - /*176*/0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, - /*184*/0xff, 0xe3 }; - static struct pattern pat_xsavec = { - "amd64-xsavec", 186, {{ 0,11 }, { 15,103 }, {122,64}, { 186,0 }} }; - - static UChar code_xsave[] = { - /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, - /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp */ - /*15*/ 0x48, - /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 
0x4c, 0x24, 0x08, - /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, - /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, - /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, - /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, - /*56*/ 0x48, 0x89, 0x94, 0x24, 0x40, 0x02, 0x00, 0x00, - /*64*/ 0x48, 0x89, 0x94, 0x24, 0x48, 0x02, 0x00, 0x00, - /*72*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, - /*80*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, - /*88*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, - /*96*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, - /*104*/0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, - /*112*/0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, - /*120*/0x0f, 0xae, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, - /*128*/0x10, 0x48, 0x8b, 0x7b, 0x08, - /*133*/0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ - /*138*/0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, - /*144*/0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, - /*152*/0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, - /*160*/0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, - /*168*/0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, - /*176*/0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, - /*184*/0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, - /*192*/0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, - /*200*/0xff, 0xe3 }; - static struct pattern pat_xsave = { - "amd64-xsave", 202, {{ 0,11 }, { 15,119 }, {138,64}, { 202,0 }} }; - - static UChar code_fxsave[] = { - /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xf0, - /* 8*/ 0x48, 0x81, 0xec, 0x40, 0x02, 0x00, 0x00, 0x48, - /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, - /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, - /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, - /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, - /*48*/ 0x30, 0x0f, 0xae, 0x44, 0x24, 0x40, 0x48, 0x8b, - /*56*/ 0x73, 0x10, 0x48, 0x8b, 0x7b, 0x08, - /*62*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ - /*67*/ 0x49, 0x89, 0xc3, 0x0f, 0xae, - /*72*/ 0x4c, 0x24, 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, - /*80*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, - /*88*/ 0x24, 0x20, 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, - /*96*/ 0x8b, 0x54, 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, - /*104*/0x08, 0x48, 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, - /*112*/0x48, 0x8b, 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, - /*120*/0xf2, 0x41, 0xff, 0xe3 }; - static struct pattern pat_fxsave = { - "amd64-fxsave", 124, {{ 0,63 }, { 67,57 }, { 124,0 }} }; - - if ((VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) && - (VG_(strncmp)(obj->name, "/lib64/ld", 9) != 0) && - (VG_(strncmp)(obj->name, "/usr/lib/ld", 11) != 0) && - (VG_(strncmp)(obj->name, "/usr/lib64/ld", 13) != 0)) return False; - Bool pat_p = check_code(obj, code, &pat); - Bool pat_xsavec_p = check_code(obj, code_xsavec, &pat_xsavec); - Bool pat_xsave_p = check_code(obj, code_xsave, &pat_xsave); - Bool pat_fxsave_p = check_code(obj, code_fxsave, &pat_fxsave); - if (pat_p || pat_xsavec_p || pat_xsave_p || pat_fxsave_p) return True; + static UChar code[] = { + /* 0*/ 0x48, 0x83, 0xec, 0x38, 0x48, 0x89, 0x04, 0x24, + /* 8*/ 0x48, 0x89, 0x4c, 0x24, 0x08, 0x48, 0x89, 0x54, 0x24, 0x10, + /*18*/ 0x48, 0x89, 0x74, 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, + /*28*/ 0x4c, 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, 0x30, + /*38*/ 0x48, 0x8b, 0x74, 0x24, 0x40, 0x49, 0x89, 0xf3, + /*46*/ 0x4c, 0x01, 0xde, 0x4c, 0x01, 0xde, 0x48, 0xc1, 0xe6, 0x03, + /*56*/ 0x48, 0x8b, 0x7c, 0x24, 0x38, 0xe8, 0xee, 0x01, 0x00, 0x00, + /*66*/ 0x49, 0x89, 0xc3, 0x4c, 
0x8b, 0x4c, 0x24, 0x30, + /*74*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*84*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, 0x24, 0x10, + /*94*/ 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, 0x8b, 0x04, 0x24, + /*103*/ 0x48, 0x83, 0xc4, 0x48, 0x41, 0xff, 0xe3}; + static struct pattern pat = { + "amd64-def", 110, {{0, 62}, {66, 44}, {110, 0}}}; + + static UChar code_xsavec[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp + */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*04*/ 0x0f, 0xc7, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*112*/ 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*117*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*122*/ 0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*128*/ 0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*136*/ 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*144*/ 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*152*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*160*/ 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*168*/ 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*176*/ 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*184*/ 0xff, 0xe3}; + static struct pattern pat_xsavec = { + "amd64-xsavec", 186, {{0, 11}, {15, 103}, {122, 64}, {186, 0}}}; + + static UChar code_xsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp + */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x40, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x48, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*104*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*112*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*120*/ 0x0f, 0xae, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*128*/ 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*133*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*138*/ 0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*144*/ 0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*152*/ 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*160*/ 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*168*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*176*/ 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*184*/ 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*192*/ 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*200*/ 0xff, 0xe3}; + static struct pattern pat_xsave = { + "amd64-xsave", 202, {{0, 11}, {15, 119}, 
{138, 64}, {202, 0}}}; + + static UChar code_fxsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xf0, + /* 8*/ 0x48, 0x81, 0xec, 0x40, 0x02, 0x00, 0x00, 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0x0f, 0xae, 0x44, 0x24, 0x40, 0x48, 0x8b, + /*56*/ 0x73, 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*62*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*67*/ 0x49, 0x89, 0xc3, 0x0f, 0xae, + /*72*/ 0x4c, 0x24, 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + /*80*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, + /*88*/ 0x24, 0x20, 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, + /*96*/ 0x8b, 0x54, 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, + /*104*/ 0x08, 0x48, 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, + /*112*/ 0x48, 0x8b, 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, + /*120*/ 0xf2, 0x41, 0xff, 0xe3}; + static struct pattern pat_fxsave = { + "amd64-fxsave", 124, {{0, 63}, {67, 57}, {124, 0}}}; + + if ((VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) && + (VG_(strncmp)(obj->name, "/lib64/ld", 9) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib/ld", 11) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib64/ld", 13) != 0)) + return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_xsavec_p = check_code(obj, code_xsavec, &pat_xsavec); + Bool pat_xsave_p = check_code(obj, code_xsave, &pat_xsave); + Bool pat_fxsave_p = check_code(obj, code_fxsave, &pat_fxsave); + if (pat_p || pat_xsavec_p || pat_xsave_p || pat_fxsave_p) + return True; #endif - /* For other platforms, no patterns known */ - return False; + /* For other platforms, no patterns known */ + return False; } - /*------------------------------------------------------------*/ /*--- Object/File/Function hash entry operations ---*/ /*------------------------------------------------------------*/ @@ -302,421 +325,408 @@ static obj_node* obj_table[N_OBJ_ENTRIES]; void TG_(init_obj_table)(void) { - Int i; - for (i = 0; i < N_OBJ_ENTRIES; i++) - obj_table[i] = 0; + Int i; + for (i = 0; i < N_OBJ_ENTRIES; i++) + obj_table[i] = 0; } -#define HASH_CONSTANT 256 +#define HASH_CONSTANT 256 -static UInt str_hash(const HChar *s, UInt table_size) +static UInt str_hash(const HChar* s, UInt table_size) { - int hash_value = 0; - for ( ; *s; s++) - hash_value = (HASH_CONSTANT * hash_value + *s) % table_size; - return hash_value; + int hash_value = 0; + for (; *s; s++) + hash_value = (HASH_CONSTANT * hash_value + *s) % table_size; + return hash_value; } - static const HChar* anonymous_obj = "???"; -static __inline__ -obj_node* new_obj_node(DebugInfo* di, obj_node* next) +static __inline__ obj_node* new_obj_node(DebugInfo* di, obj_node* next) { - Int i; + Int i; obj_node* obj; - obj = (obj_node*) TG_MALLOC("cl.fn.non.1", sizeof(obj_node)); - obj->name = di ? VG_(strdup)( "cl.fn.non.2", - VG_(DebugInfo_get_filename)(di) ) - : anonymous_obj; + obj = (obj_node*)TG_MALLOC("cl.fn.non.1", sizeof(obj_node)); + obj->name = di ? VG_(strdup)("cl.fn.non.2", VG_(DebugInfo_get_filename)(di)) + : anonymous_obj; for (i = 0; i < N_FILE_ENTRIES; i++) { obj->files[i] = NULL; } - TG_(stat).distinct_objs ++; - obj->number = TG_(stat).distinct_objs; + TG_(stat).distinct_objs++; + obj->number = TG_(stat).distinct_objs; /* JRS 2008 Feb 19: maybe rename .start/.size/.offset to .text_avma/.text_size/.test_bias to make it clearer what these fields really mean */ - obj->start = di ? 
VG_(DebugInfo_get_text_avma)(di) : 0; - obj->size = di ? VG_(DebugInfo_get_text_size)(di) : 0; - obj->offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; - obj->next = next; + obj->start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + obj->size = di ? VG_(DebugInfo_get_text_size)(di) : 0; + obj->offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + obj->next = next; // not only used for debug output (see static.c) obj->last_slash_pos = 0; - i = 0; - while(obj->name[i]) { - if (obj->name[i]=='/') obj->last_slash_pos = i+1; - i++; + i = 0; + while (obj->name[i]) { + if (obj->name[i] == '/') + obj->last_slash_pos = i + 1; + i++; } obj->name_len = i; - if (runtime_resolve_addrs == 0) search_runtime_resolve(obj); + if (runtime_resolve_addrs == 0) + search_runtime_resolve(obj); return obj; } obj_node* TG_(get_obj_node)(DebugInfo* di) { - obj_node* curr_obj_node; - UInt objname_hash; - const HChar* obj_name; - - obj_name = di ? VG_(DebugInfo_get_filename)(di) : anonymous_obj; - - /* lookup in obj hash */ - objname_hash = str_hash(obj_name, N_OBJ_ENTRIES); - curr_obj_node = obj_table[objname_hash]; - while (NULL != curr_obj_node && - VG_(strcmp)(obj_name, curr_obj_node->name) != 0) { - curr_obj_node = curr_obj_node->next; - } - if (NULL == curr_obj_node) { - obj_table[objname_hash] = curr_obj_node = - new_obj_node(di, obj_table[objname_hash]); - } - - return curr_obj_node; -} + obj_node* curr_obj_node; + UInt objname_hash; + const HChar* obj_name; + + obj_name = di ? VG_(DebugInfo_get_filename)(di) : anonymous_obj; + + /* lookup in obj hash */ + objname_hash = str_hash(obj_name, N_OBJ_ENTRIES); + curr_obj_node = obj_table[objname_hash]; + while (NULL != curr_obj_node && + VG_(strcmp)(obj_name, curr_obj_node->name) != 0) { + curr_obj_node = curr_obj_node->next; + } + if (NULL == curr_obj_node) { + obj_table[objname_hash] = curr_obj_node = + new_obj_node(di, obj_table[objname_hash]); + } + return curr_obj_node; +} -static __inline__ -file_node* new_file_node(const HChar *filename, - obj_node* obj, file_node* next) +static __inline__ file_node* +new_file_node(const HChar* filename, obj_node* obj, file_node* next) { - Int i; - file_node* file = (file_node*) TG_MALLOC("cl.fn.nfn.1", - sizeof(file_node)); - file->name = VG_(strdup)("cl.fn.nfn.2", filename); - file->name_len = VG_(strlen)(filename); - for (i = 0; i < N_FN_ENTRIES; i++) { - file->fns[i] = NULL; - } - TG_(stat).distinct_files++; - file->obj = obj; - file->next = next; - return file; + Int i; + file_node* file = (file_node*)TG_MALLOC("cl.fn.nfn.1", sizeof(file_node)); + file->name = VG_(strdup)("cl.fn.nfn.2", filename); + file->name_len = VG_(strlen)(filename); + for (i = 0; i < N_FN_ENTRIES; i++) { + file->fns[i] = NULL; + } + TG_(stat).distinct_files++; + file->obj = obj; + file->next = next; + return file; } - -file_node* TG_(get_file_node)(obj_node* curr_obj_node, - const HChar *dir, const HChar *file) +file_node* +TG_(get_file_node)(obj_node* curr_obj_node, const HChar* dir, const HChar* file) { - file_node* curr_file_node; - UInt filename_hash; - - /* Build up an absolute pathname, if there is a directory available */ - HChar filename[VG_(strlen)(dir) + 1 + VG_(strlen)(file) + 1]; - VG_(strcpy)(filename, dir); - if (filename[0] != '\0') { - VG_(strcat)(filename, "/"); - } - VG_(strcat)(filename, file); - - /* lookup in file hash */ - filename_hash = str_hash(filename, N_FILE_ENTRIES); - curr_file_node = curr_obj_node->files[filename_hash]; - while (NULL != curr_file_node && - VG_(strcmp)(filename, curr_file_node->name) != 0) { - 
curr_file_node = curr_file_node->next; - } - if (NULL == curr_file_node) { - curr_obj_node->files[filename_hash] = curr_file_node = - new_file_node(filename, curr_obj_node, - curr_obj_node->files[filename_hash]); - } - - return curr_file_node; + file_node* curr_file_node; + UInt filename_hash; + + /* Build up an absolute pathname, if there is a directory available */ + HChar filename[VG_(strlen)(dir) + 1 + VG_(strlen)(file) + 1]; + VG_(strcpy)(filename, dir); + if (filename[0] != '\0') { + VG_(strcat)(filename, "/"); + } + VG_(strcat)(filename, file); + + /* lookup in file hash */ + filename_hash = str_hash(filename, N_FILE_ENTRIES); + curr_file_node = curr_obj_node->files[filename_hash]; + while (NULL != curr_file_node && + VG_(strcmp)(filename, curr_file_node->name) != 0) { + curr_file_node = curr_file_node->next; + } + if (NULL == curr_file_node) { + curr_obj_node->files[filename_hash] = curr_file_node = new_file_node( + filename, curr_obj_node, curr_obj_node->files[filename_hash]); + } + + return curr_file_node; } /* forward decl. */ static void resize_fn_array(void); -static __inline__ -fn_node* new_fn_node(const HChar *fnname, - file_node* file, fn_node* next) +static __inline__ fn_node* +new_fn_node(const HChar* fnname, file_node* file, fn_node* next) { - fn_node* fn = (fn_node*) TG_MALLOC("cl.fn.nfnnd.1", - sizeof(fn_node)); - fn->name = VG_(strdup)("cl.fn.nfnnd.2", fnname); - fn->name_len = VG_(strlen)(fnname); - - TG_(stat).distinct_fns++; - fn->number = TG_(stat).distinct_fns; - fn->last_cxt = 0; - fn->pure_cxt = 0; - fn->file = file; - fn->next = next; - - fn->toggle_collect = False; - fn->skip = False; - fn->pop_on_jump = TG_(clo).pop_on_jump; - fn->group = 0; - fn->separate_callers = TG_(clo).separate_callers; - fn->separate_recursions = TG_(clo).separate_recursions; + fn_node* fn = (fn_node*)TG_MALLOC("cl.fn.nfnnd.1", sizeof(fn_node)); + fn->name = VG_(strdup)("cl.fn.nfnnd.2", fnname); + fn->name_len = VG_(strlen)(fnname); + + TG_(stat).distinct_fns++; + fn->number = TG_(stat).distinct_fns; + fn->last_cxt = 0; + fn->pure_cxt = 0; + fn->file = file; + fn->next = next; + + fn->toggle_collect = False; + fn->skip = False; + fn->pop_on_jump = TG_(clo).pop_on_jump; + fn->group = 0; + fn->separate_callers = TG_(clo).separate_callers; + fn->separate_recursions = TG_(clo).separate_recursions; #if TG_ENABLE_DEBUG - fn->verbosity = -1; + fn->verbosity = -1; #endif - if (TG_(stat).distinct_fns >= current_fn_active.size) - resize_fn_array(); + if (TG_(stat).distinct_fns >= current_fn_active.size) + resize_fn_array(); - return fn; + return fn; } - /* Get a function node in hash2 with known file node. 
* hash nodes are created if needed */ -static -fn_node* get_fn_node_infile(file_node* curr_file_node, - const HChar *fnname) +static fn_node* get_fn_node_infile(file_node* curr_file_node, + const HChar* fnname) { - fn_node* curr_fn_node; - UInt fnname_hash; - - TG_ASSERT(curr_file_node != 0); - - /* lookup in function hash */ - fnname_hash = str_hash(fnname, N_FN_ENTRIES); - curr_fn_node = curr_file_node->fns[fnname_hash]; - while (NULL != curr_fn_node && - VG_(strcmp)(fnname, curr_fn_node->name) != 0) { - curr_fn_node = curr_fn_node->next; - } - if (NULL == curr_fn_node) { - curr_file_node->fns[fnname_hash] = curr_fn_node = - new_fn_node(fnname, curr_file_node, - curr_file_node->fns[fnname_hash]); - } - - return curr_fn_node; -} + fn_node* curr_fn_node; + UInt fnname_hash; + TG_ASSERT(curr_file_node != 0); + + /* lookup in function hash */ + fnname_hash = str_hash(fnname, N_FN_ENTRIES); + curr_fn_node = curr_file_node->fns[fnname_hash]; + while (NULL != curr_fn_node && + VG_(strcmp)(fnname, curr_fn_node->name) != 0) { + curr_fn_node = curr_fn_node->next; + } + if (NULL == curr_fn_node) { + curr_file_node->fns[fnname_hash] = curr_fn_node = + new_fn_node(fnname, curr_file_node, curr_file_node->fns[fnname_hash]); + } + + return curr_fn_node; +} /* Get a function node in a Segment. * Hash nodes are created if needed. */ -static __inline__ -fn_node* get_fn_node_inseg(DebugInfo* di, - const HChar *dirname, - const HChar *filename, - const HChar *fnname) +static __inline__ fn_node* get_fn_node_inseg(DebugInfo* di, + const HChar* dirname, + const HChar* filename, + const HChar* fnname) { - obj_node *obj = TG_(get_obj_node)(di); - file_node *file = TG_(get_file_node)(obj, dirname, filename); - fn_node *fn = get_fn_node_infile(file, fnname); + obj_node* obj = TG_(get_obj_node)(di); + file_node* file = TG_(get_file_node)(obj, dirname, filename); + fn_node* fn = get_fn_node_infile(file, fnname); - return fn; + return fn; } - -Bool TG_(get_debug_info)(Addr instr_addr, - const HChar **dir, - const HChar **file, - const HChar **fn_name, UInt* line_num, - DebugInfo** pDebugInfo) +Bool TG_(get_debug_info)(Addr instr_addr, + const HChar** dir, + const HChar** file, + const HChar** fn_name, + UInt* line_num, + DebugInfo** pDebugInfo) { - Bool found_file_line, found_fn, result = True; - UInt line; - - TG_DEBUG(6, " + get_debug_info(%#lx)\n", instr_addr); + Bool found_file_line, found_fn, result = True; + UInt line; - DiEpoch ep = VG_(current_DiEpoch)(); - if (pDebugInfo) { + TG_DEBUG(6, " + get_debug_info(%#lx)\n", instr_addr); + + DiEpoch ep = VG_(current_DiEpoch)(); + if (pDebugInfo) { *pDebugInfo = VG_(find_DebugInfo)(ep, instr_addr); // for generated code in anonymous space, pSegInfo is 0 } - found_file_line = VG_(get_filename_linenum)(ep, instr_addr, - file, - dir, - &line); + found_file_line = + VG_(get_filename_linenum)(ep, instr_addr, file, dir, &line); found_fn = VG_(get_fnname)(ep, instr_addr, fn_name); if (!found_file_line && !found_fn) { - TG_(stat).no_debug_BBs++; - *file = "???"; - *fn_name = "???"; - if (line_num) *line_num=0; - result = False; - - } else if ( found_file_line && found_fn) { - TG_(stat).full_debug_BBs++; - if (line_num) *line_num=line; - - } else if ( found_file_line && !found_fn) { - TG_(stat).file_line_debug_BBs++; - *fn_name = "???"; - if (line_num) *line_num=line; - - } else /*(!found_file_line && found_fn)*/ { - TG_(stat).fn_name_debug_BBs++; - *file = "???"; - if (line_num) *line_num=0; + TG_(stat).no_debug_BBs++; + *file = "???"; + *fn_name = "???"; + if (line_num) + 
*line_num = 0; + result = False; + + } else if (found_file_line && found_fn) { + TG_(stat).full_debug_BBs++; + if (line_num) + *line_num = line; + + } else if (found_file_line && !found_fn) { + TG_(stat).file_line_debug_BBs++; + *fn_name = "???"; + if (line_num) + *line_num = line; + + } else /*(!found_file_line && found_fn)*/ { + TG_(stat).fn_name_debug_BBs++; + *file = "???"; + if (line_num) + *line_num = 0; } - TG_DEBUG(6, " - get_debug_info(%#lx): seg '%s', fn %s\n", - instr_addr, - !pDebugInfo ? "-" : - (*pDebugInfo) ? VG_(DebugInfo_get_filename)(*pDebugInfo) : - "(None)", - *fn_name); + TG_DEBUG(6, " - get_debug_info(%#lx): seg '%s', fn %s\n", instr_addr, + !pDebugInfo ? "-" + : (*pDebugInfo) ? VG_(DebugInfo_get_filename)(*pDebugInfo) + : "(None)", + *fn_name); - return result; + return result; } /* for _libc_freeres_wrapper => _exit renaming */ static BB* exit_bb = 0; - /* * Attach function struct to a BB from debug info. */ fn_node* TG_(get_fn_node)(BB* bb) { - const HChar *fnname, *filename, *dirname; - DebugInfo* di; - UInt line_num; - fn_node* fn; - Int i; - - /* fn from debug info is idempotent for a BB */ - if (bb->fn) return bb->fn; - - TG_DEBUG(3,"+ get_fn_node(BB %#lx)\n", bb_addr(bb)); - - /* get function/file name, line number and object of - * the BB according to debug information - */ - TG_(get_debug_info)(bb_addr(bb), - &dirname, &filename, &fnname, &line_num, &di); - - DiEpoch ep = VG_(current_DiEpoch)(); - - /* Build inline stack for this BB using InlIPCursor */ - { - InlIPCursor* iipc = VG_(new_IIPC)(ep, bb_addr(bb)); - if (iipc) { - const HChar* tmp[TG_MAX_INL_DEPTH + 1]; - Int total = 0; - do { - const HChar* fn_name = NULL; - VG_(get_fnname_inl)(ep, bb_addr(bb), &fn_name, iipc); - if (fn_name && total < TG_MAX_INL_DEPTH + 1) - tmp[total++] = fn_name; - } while (VG_(next_IIPC)(iipc)); - VG_(delete_IIPC)(iipc); - - /* tmp[] is innermost-first; last entry is the non-inlined function (skip it) */ - Int inl_count = total - 1; - if (inl_count > 0) { - bb->inl_depth = inl_count; - bb->inl_fns = VG_(malloc)("tg.bb.inl", inl_count * sizeof(HChar*)); - /* Reverse into outermost-first order */ - for (Int i = 0; i < inl_count; i++) - bb->inl_fns[i] = tmp[inl_count - 1 - i]; - } - } - } - - if (0 == VG_(strcmp)(fnname, "???")) { - int p; - static HChar buf[32]; // for sure large enough - /* Use address as found in library */ - if (sizeof(Addr) == 4) - p = VG_(sprintf)(buf, "%#08lx", (UWord)bb->offset); - else - // 64bit address - p = VG_(sprintf)(buf, "%#016lx", (UWord)bb->offset); - - VG_(sprintf)(buf + p, "%s", - (bb->sect_kind == Vg_SectData) ? " [Data]" : - (bb->sect_kind == Vg_SectBSS) ? " [BSS]" : - (bb->sect_kind == Vg_SectGOT) ? " [GOT]" : - (bb->sect_kind == Vg_SectPLT) ? 
" [PLT]" : ""); - fnname = buf; - } - else { + const HChar *fnname, *filename, *dirname; + DebugInfo* di; + UInt line_num; + fn_node* fn; + Int i; + + /* fn from debug info is idempotent for a BB */ + if (bb->fn) + return bb->fn; + + TG_DEBUG(3, "+ get_fn_node(BB %#lx)\n", bb_addr(bb)); + + /* get function/file name, line number and object of + * the BB according to debug information + */ + TG_(get_debug_info) + (bb_addr(bb), &dirname, &filename, &fnname, &line_num, &di); + + DiEpoch ep = VG_(current_DiEpoch)(); + + /* Build inline stack for this BB using InlIPCursor */ + { + InlIPCursor* iipc = VG_(new_IIPC)(ep, bb_addr(bb)); + if (iipc) { + const HChar* tmp[TG_MAX_INL_DEPTH + 1]; + Int total = 0; + do { + const HChar* fn_name = NULL; + VG_(get_fnname_inl)(ep, bb_addr(bb), &fn_name, iipc); + if (fn_name && total < TG_MAX_INL_DEPTH + 1) + tmp[total++] = fn_name; + } while (VG_(next_IIPC)(iipc)); + VG_(delete_IIPC)(iipc); + + /* tmp[] is innermost-first; last entry is the non-inlined function + * (skip it) */ + Int inl_count = total - 1; + if (inl_count > 0) { + bb->inl_depth = inl_count; + bb->inl_fns = VG_(malloc)("tg.bb.inl", inl_count * sizeof(HChar*)); + /* Reverse into outermost-first order */ + for (Int i = 0; i < inl_count; i++) + bb->inl_fns[i] = tmp[inl_count - 1 - i]; + } + } + } + + if (0 == VG_(strcmp)(fnname, "???")) { + int p; + static HChar buf[32]; // for sure large enough + /* Use address as found in library */ + if (sizeof(Addr) == 4) + p = VG_(sprintf)(buf, "%#08lx", (UWord)bb->offset); + else + // 64bit address + p = VG_(sprintf)(buf, "%#016lx", (UWord)bb->offset); + + VG_(sprintf)(buf + p, "%s", + (bb->sect_kind == Vg_SectData) ? " [Data]" + : (bb->sect_kind == Vg_SectBSS) ? " [BSS]" + : (bb->sect_kind == Vg_SectGOT) ? " [GOT]" + : (bb->sect_kind == Vg_SectPLT) ? 
" [PLT]" + : ""); + fnname = buf; + } else { if (VG_(get_fnname_if_entry)(ep, bb_addr(bb), &fnname)) - bb->is_entry = 1; - } - - /* HACK for correct _exit: - * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind, - * so we rename it back again :-) - */ - if (0 == VG_(strcmp)(fnname, "vgPlain___libc_freeres_wrapper") - && exit_bb) { - TG_(get_debug_info)(bb_addr(exit_bb), - &dirname, &filename, &fnname, &line_num, &di); - - TG_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n"); - } - if (0 == VG_(strcmp)(fnname, "_exit") && !exit_bb) - exit_bb = bb; - - for (i = 0; i < runtime_resolve_addrs; i++) { + bb->is_entry = 1; + } + + /* HACK for correct _exit: + * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind, + * so we rename it back again :-) + */ + if (0 == VG_(strcmp)(fnname, "vgPlain___libc_freeres_wrapper") && exit_bb) { + TG_(get_debug_info) + (bb_addr(exit_bb), &dirname, &filename, &fnname, &line_num, &di); + + TG_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n"); + } + if (0 == VG_(strcmp)(fnname, "_exit") && !exit_bb) + exit_bb = bb; + + for (i = 0; i < runtime_resolve_addrs; i++) { if ((bb_addr(bb) >= runtime_resolve_addr[i]) && - (bb_addr(bb) < runtime_resolve_addr[i] + runtime_resolve_length[i])) { - /* BB in runtime_resolve found by code check; use this name */ - fnname = "_dl_runtime_resolve"; - break; + (bb_addr(bb) < runtime_resolve_addr[i] + runtime_resolve_length[i])) { + /* BB in runtime_resolve found by code check; use this name */ + fnname = "_dl_runtime_resolve"; + break; } - } + } - /* get fn_node struct for this function */ - fn = get_fn_node_inseg( di, dirname, filename, fnname); + /* get fn_node struct for this function */ + fn = get_fn_node_inseg(di, dirname, filename, fnname); - /* if this is the 1st time the function is seen, - * some attributes are set */ - if (fn->pure_cxt == 0) { + /* if this is the 1st time the function is seen, + * some attributes are set */ + if (fn->pure_cxt == 0) { /* Every function gets a "pure" context, i.e. a context with stack * depth 1 only with this function. 
This is for compression of mangled * names */ fn_node* pure[2]; - pure[0] = 0; - pure[1] = fn; - fn->pure_cxt = TG_(get_cxt)(pure+1); - - if (bb->sect_kind == Vg_SectPLT || bb->sect_kind == Vg_SectPLTSEC) - fn->skip = TG_(clo).skip_plt; - - if (VG_(strncmp)(fn->name, "_dl_runtime_resolve", 19)==0) { - fn->pop_on_jump = True; - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "Symbol match: found runtime_resolve:" - " %s +%#lx=%#lx\n", - bb->obj->name + bb->obj->last_slash_pos, - (UWord)bb->offset, bb_addr(bb)); + pure[0] = 0; + pure[1] = fn; + fn->pure_cxt = TG_(get_cxt)(pure + 1); + + if (bb->sect_kind == Vg_SectPLT || bb->sect_kind == Vg_SectPLTSEC) + fn->skip = TG_(clo).skip_plt; + + if (VG_(strncmp)(fn->name, "_dl_runtime_resolve", 19) == 0) { + fn->pop_on_jump = True; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Symbol match: found runtime_resolve:" + " %s +%#lx=%#lx\n", + bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); } /* apply config options from function name patterns * given on command line */ TG_(update_fn_config)(fn); - } - + } - bb->fn = fn; - bb->line = line_num; + bb->fn = fn; + bb->line = line_num; - if (dirname[0]) { - TG_DEBUG(3,"- get_fn_node(BB %#lx): %s (in %s:%u)\n", - bb_addr(bb), fnname, filename, line_num); - } else - TG_DEBUG(3,"- get_fn_node(BB %#lx): %s (in %s/%s:%u)\n", - bb_addr(bb), fnname, dirname, filename, line_num); + if (dirname[0]) { + TG_DEBUG(3, "- get_fn_node(BB %#lx): %s (in %s:%u)\n", bb_addr(bb), + fnname, filename, line_num); + } else + TG_DEBUG(3, "- get_fn_node(BB %#lx): %s (in %s/%s:%u)\n", bb_addr(bb), + fnname, dirname, filename, line_num); - return fn; + return fn; } - /*------------------------------------------------------------*/ /*--- Active function array operations ---*/ /*------------------------------------------------------------*/ @@ -731,47 +741,43 @@ fn_node* TG_(get_fn_node)(BB* bb) UInt* TG_(get_fn_entry)(Int n) { - TG_ASSERT(n < current_fn_active.size); - return current_fn_active.array + n; + TG_ASSERT(n < current_fn_active.size); + return current_fn_active.array + n; } void TG_(init_fn_array)(fn_array* a) { - Int i; - - TG_ASSERT(a != 0); - - a->size = N_INITIAL_FN_ARRAY_SIZE; - if (a->size <= TG_(stat).distinct_fns) - a->size = TG_(stat).distinct_fns+1; - - a->array = (UInt*) TG_MALLOC("cl.fn.gfe.1", - a->size * sizeof(UInt)); - for(i=0;isize;i++) - a->array[i] = 0; + Int i; + + TG_ASSERT(a != 0); + + a->size = N_INITIAL_FN_ARRAY_SIZE; + if (a->size <= TG_(stat).distinct_fns) + a->size = TG_(stat).distinct_fns + 1; + + a->array = (UInt*)TG_MALLOC("cl.fn.gfe.1", a->size * sizeof(UInt)); + for (i = 0; i < a->size; i++) + a->array[i] = 0; } void TG_(copy_current_fn_array)(fn_array* dst) { - TG_ASSERT(dst != 0); + TG_ASSERT(dst != 0); - dst->size = current_fn_active.size; - dst->array = current_fn_active.array; + dst->size = current_fn_active.size; + dst->array = current_fn_active.array; } -fn_array* TG_(get_current_fn_array)(void) -{ - return ¤t_fn_active; -} +fn_array* TG_(get_current_fn_array)(void) { return ¤t_fn_active; } void TG_(set_current_fn_array)(fn_array* a) { - TG_ASSERT(a != 0); + TG_ASSERT(a != 0); - current_fn_active.size = a->size; - current_fn_active.array = a->array; - if (current_fn_active.size <= TG_(stat).distinct_fns) - resize_fn_array(); + current_fn_active.size = a->size; + current_fn_active.array = a->array; + if (current_fn_active.size <= TG_(stat).distinct_fns) + resize_fn_array(); } /* ensure that active_array is big enough: @@ -780,25 +786,24 
@@ void TG_(set_current_fn_array)(fn_array* a) */ static void resize_fn_array(void) { - UInt* new_array; - Int i; + UInt* new_array; + Int i; - UInt newsize = current_fn_active.size; - while (newsize <= TG_(stat).distinct_fns) newsize *=2; + UInt newsize = current_fn_active.size; + while (newsize <= TG_(stat).distinct_fns) + newsize *= 2; - TG_DEBUG(0, "Resize fn_active_array: %u => %u\n", - current_fn_active.size, newsize); + TG_DEBUG(0, "Resize fn_active_array: %u => %u\n", current_fn_active.size, + newsize); - new_array = (UInt*) TG_MALLOC("cl.fn.rfa.1", newsize * sizeof(UInt)); - for(i=0;ibb->jmp_addr => to->bb->addr) * @@ -232,41 +226,37 @@ typedef enum { */ struct _jCC { - TgJumpKind jmpkind; /* jk_Call, jk_Jump, jk_CondJump */ - jCC* next_hash; /* for hash entry chain */ - jCC* next_from; /* next JCC from a BBCC */ - BBCC *from, *to; /* call arc from/to this BBCC */ - UInt jmp; /* jump no. in source */ + TgJumpKind jmpkind; /* jk_Call, jk_Jump, jk_CondJump */ + jCC* next_hash; /* for hash entry chain */ + jCC* next_from; /* next JCC from a BBCC */ + BBCC * from, *to; /* call arc from/to this BBCC */ + UInt jmp; /* jump no. in source */ - ULong call_counter; /* no wraparound with 64 bit */ + ULong call_counter; /* no wraparound with 64 bit */ - FullCost cost; /* simulator + user counters */ + FullCost cost; /* simulator + user counters */ }; - /* * Info for one instruction of a basic block. */ typedef struct _InstrInfo InstrInfo; struct _InstrInfo { - UInt instr_offset; - UInt instr_size; - UInt cost_offset; - EventSet* eventset; + UInt instr_offset; + UInt instr_size; + UInt cost_offset; + EventSet* eventset; }; - - /* * Info for a side exit in a BB */ typedef struct _CJmpInfo CJmpInfo; struct _CJmpInfo { - UInt instr; /* instruction index for BB.instr array */ - TgJumpKind jmpkind; /* jump kind when leaving BB at this side exit */ + UInt instr; /* instruction index for BB.instr array */ + TgJumpKind jmpkind; /* jump kind when leaving BB at this side exit */ }; - /** * An instrumented basic block (BB). * @@ -284,37 +274,36 @@ struct _CJmpInfo { * BBCC is set by setup_bbcc. */ struct _BB { - obj_node* obj; /* ELF object of BB */ - PtrdiffT offset; /* offset of BB in ELF object file */ - BB* next; /* chaining for a hash entry */ - - VgSectKind sect_kind; /* section of this BB, e.g. PLT */ - UInt instr_count; - - /* filled by TG_(get_fn_node) if debug info is available */ - fn_node* fn; /* debug info for this BB */ - UInt line; - Bool is_entry; /* True if this BB is a function entry */ - - BBCC* bbcc_list; /* BBCCs for same BB (see next_bbcc in BBCC) */ - BBCC* last_bbcc; /* Temporary: Cached for faster access (LRU) */ - - /* filled by TG_(instrument) if not seen before */ - UInt cjmp_count; /* number of side exits */ - CJmpInfo* jmp; /* array of info for condition jumps, - * allocated directly after this struct */ - Bool cjmp_inverted; /* is last side exit actually fall through? */ - - const HChar** inl_fns; /* inlined fn names at BB start (outermost first), or NULL */ - UInt inl_depth; /* number of entries in inl_fns */ - - UInt instr_len; - UInt cost_count; - InstrInfo instr[0]; /* info on instruction sizes and costs */ + obj_node* obj; /* ELF object of BB */ + PtrdiffT offset; /* offset of BB in ELF object file */ + BB* next; /* chaining for a hash entry */ + + VgSectKind sect_kind; /* section of this BB, e.g. 
PLT */ + UInt instr_count; + + /* filled by TG_(get_fn_node) if debug info is available */ + fn_node* fn; /* debug info for this BB */ + UInt line; + Bool is_entry; /* True if this BB is a function entry */ + + BBCC* bbcc_list; /* BBCCs for same BB (see next_bbcc in BBCC) */ + BBCC* last_bbcc; /* Temporary: Cached for faster access (LRU) */ + + /* filled by TG_(instrument) if not seen before */ + UInt cjmp_count; /* number of side exits */ + CJmpInfo* jmp; /* array of info for condition jumps, + * allocated directly after this struct */ + Bool cjmp_inverted; /* is last side exit actually fall through? */ + + const HChar** + inl_fns; /* inlined fn names at BB start (outermost first), or NULL */ + UInt inl_depth; /* number of entries in inl_fns */ + + UInt instr_len; + UInt cost_count; + InstrInfo instr[0]; /* info on instruction sizes and costs */ }; - - /** * Function context * @@ -331,24 +320,22 @@ struct _BB { * For each Context, recursion index and BB, there can be a BBCC. */ struct _Context { - UInt size; // number of function dependencies - UInt base_number; // for context compression & dump array - Context* next; // entry chaining for hash - UWord hash; // for faster lookup... - fn_node* fn[0]; + UInt size; // number of function dependencies + UInt base_number; // for context compression & dump array + Context* next; // entry chaining for hash + UWord hash; // for faster lookup... + fn_node* fn[0]; }; - /* * Cost info for a side exits from a BB */ typedef struct _JmpData JmpData; struct _JmpData { - ULong ecounter; /* number of times the BB was left at this exit */ - jCC* jcc_list; /* JCCs used for this exit */ + ULong ecounter; /* number of times the BB was left at this exit */ + jCC* jcc_list; /* JCCs used for this exit */ }; - /* * Basic Block Cost Center * @@ -365,54 +352,53 @@ struct _JmpData { * They are distinguishable by their tag field. */ struct _BBCC { - BB* bb; /* BB for this cost center */ - - Context* cxt; /* execution context of this BBCC */ - ThreadId tid; /* only for assertion check purpose */ - UInt rec_index; /* Recursion index in rec->bbcc for this bbcc */ - BBCC** rec_array; /* Variable sized array of pointers to - * recursion BBCCs. Shared. */ - BBCC* next_bbcc; /* Chain of BBCCs for same BB */ - BBCC* lru_next_bbcc; /* BBCC executed next the last time */ - - jCC* lru_from_jcc; /* Temporary: Cached for faster access (LRU) */ - jCC* lru_to_jcc; /* Temporary: Cached for faster access (LRU) */ - FullCost skipped; /* cost for skipped functions called from - * jmp_addr. Allocated lazy */ - - BBCC* next; /* entry chain in hash */ - ULong* cost; /* start of 64bit costs for this BBCC */ - ULong ecounter_sum; /* execution counter for first instruction of BB */ - JmpData jmp[0]; + BB* bb; /* BB for this cost center */ + + Context* cxt; /* execution context of this BBCC */ + ThreadId tid; /* only for assertion check purpose */ + UInt rec_index; /* Recursion index in rec->bbcc for this bbcc */ + BBCC** rec_array; /* Variable sized array of pointers to + * recursion BBCCs. Shared. */ + BBCC* next_bbcc; /* Chain of BBCCs for same BB */ + BBCC* lru_next_bbcc; /* BBCC executed next the last time */ + + jCC* lru_from_jcc; /* Temporary: Cached for faster access (LRU) */ + jCC* lru_to_jcc; /* Temporary: Cached for faster access (LRU) */ + FullCost skipped; /* cost for skipped functions called from + * jmp_addr. 
Allocated lazy */ + + BBCC* next; /* entry chain in hash */ + ULong* cost; /* start of 64bit costs for this BBCC */ + ULong ecounter_sum; /* execution counter for first instruction of BB */ + JmpData jmp[0]; }; - struct _fn_node { - HChar* name; - UInt name_len; - UInt number; - Context* last_cxt; /* LRU info */ - Context* pure_cxt; /* the context with only the function itself */ - file_node* file; /* reverse mapping for 2nd hash */ - fn_node* next; - - Bool toggle_collect :1; - Bool skip :1; - Bool pop_on_jump : 1; - - Int group; - Int separate_callers; - Int separate_recursions; + HChar* name; + UInt name_len; + UInt number; + Context* last_cxt; /* LRU info */ + Context* pure_cxt; /* the context with only the function itself */ + file_node* file; /* reverse mapping for 2nd hash */ + fn_node* next; + + Bool toggle_collect : 1; + Bool skip : 1; + Bool pop_on_jump : 1; + + Int group; + Int separate_callers; + Int separate_recursions; #if TG_ENABLE_DEBUG - Int verbosity; /* Stores old verbosity level while in function */ + Int verbosity; /* Stores old verbosity level while in function */ #endif }; /* Quite arbitrary fixed hash sizes */ -#define N_OBJ_ENTRIES 47 -#define N_FILE_ENTRIES 53 -#define N_FN_ENTRIES 87 +#define N_OBJ_ENTRIES 47 +#define N_FILE_ENTRIES 53 +#define N_FN_ENTRIES 87 struct _file_node { HChar* name; @@ -428,12 +414,12 @@ struct _file_node { */ struct _obj_node { const HChar* name; - UInt name_len; - UInt last_slash_pos; + UInt name_len; + UInt last_slash_pos; - Addr start; /* Start address of text segment mapping */ - SizeT size; /* Length of mapping */ - PtrdiffT offset; /* Offset between symbol address and file offset */ + Addr start; /* Start address of text segment mapping */ + SizeT size; /* Length of mapping */ + PtrdiffT offset; /* Offset between symbol address and file offset */ file_node* files[N_FILE_ENTRIES]; UInt number; @@ -448,17 +434,16 @@ struct _obj_node { * instructions. */ struct _call_entry { - jCC* jcc; /* jCC for this call */ - FullCost enter_cost; /* cost event counters at entering frame */ - Addr sp; /* stack pointer directly after call */ - Addr ret_addr; /* address to which to return to - * is 0 on a simulated call */ - BBCC* nonskipped; /* see above */ - Context* cxt; /* context before call */ - Int fn_sp; /* function stack index before call */ + jCC* jcc; /* jCC for this call */ + FullCost enter_cost; /* cost event counters at entering frame */ + Addr sp; /* stack pointer directly after call */ + Addr ret_addr; /* address to which to return to + * is 0 on a simulated call */ + BBCC* nonskipped; /* see above */ + Context* cxt; /* context before call */ + Int fn_sp; /* function stack index before call */ }; - /* * Execution state of main thread or a running signal handler in * a thread while interrupted by another signal handler. 
@@ -473,36 +458,36 @@ struct _call_entry { typedef struct _exec_state exec_state; struct _exec_state { - /* the signum of the handler, 0 for main thread context - */ - Int sig; + /* the signum of the handler, 0 for main thread context + */ + Int sig; - /* the old call stack pointer at entering the signal handler */ - Int orig_sp; + /* the old call stack pointer at entering the signal handler */ + Int orig_sp; - FullCost cost; - Bool collect; - Context* cxt; + FullCost cost; + Bool collect; + Context* cxt; - /* number of conditional jumps passed in last BB */ - Int jmps_passed; - BBCC* bbcc; /* last BB executed */ - BBCC* nonskipped; + /* number of conditional jumps passed in last BB */ + Int jmps_passed; + BBCC* bbcc; /* last BB executed */ + BBCC* nonskipped; - Int call_stack_bottom; /* Index into fn_stack */ + Int call_stack_bottom; /* Index into fn_stack */ }; /* Global state structures */ typedef struct _bb_hash bb_hash; struct _bb_hash { - UInt size, entries; - BB** table; + UInt size, entries; + BB** table; }; typedef struct _cxt_hash cxt_hash; struct _cxt_hash { - UInt size, entries; - Context** table; + UInt size, entries; + Context** table; }; /* Thread specific state structures, i.e. parts of a thread state. @@ -511,34 +496,34 @@ struct _cxt_hash { */ typedef struct _bbcc_hash bbcc_hash; struct _bbcc_hash { - UInt size, entries; - BBCC** table; + UInt size, entries; + BBCC** table; }; typedef struct _jcc_hash jcc_hash; struct _jcc_hash { - UInt size, entries; - jCC** table; - jCC* spontaneous; + UInt size, entries; + jCC** table; + jCC* spontaneous; }; typedef struct _fn_array fn_array; struct _fn_array { - UInt size; - UInt* array; + UInt size; + UInt* array; }; typedef struct _call_stack call_stack; struct _call_stack { - UInt size; - Int sp; - call_entry* entry; + UInt size; + Int sp; + call_entry* entry; }; typedef struct _fn_stack fn_stack; struct _fn_stack { - UInt size; - fn_node **bottom, **top; + UInt size; + fn_node **bottom, **top; }; /* The maximum number of simultaneous running signal handlers per thread. 
@@ -548,8 +533,8 @@ struct _fn_stack { typedef struct _exec_stack exec_stack; struct _exec_stack { - Int sp; /* > 0 if a handler is running */ - exec_state* entry[MAX_SIGHANDLERS]; + Int sp; /* > 0 if a handler is running */ + exec_state* entry[MAX_SIGHANDLERS]; }; /* Thread State @@ -563,85 +548,82 @@ struct _exec_stack { */ struct _thread_info { - /* state */ - fn_stack fns; /* function stack */ - call_stack calls; /* context call arc stack */ - exec_stack states; /* execution states interrupted by signals */ + /* state */ + fn_stack fns; /* function stack */ + call_stack calls; /* context call arc stack */ + exec_stack states; /* execution states interrupted by signals */ - /* cost tracking */ - FullCost lastdump_cost; /* Cost at last total cost computation */ + /* cost tracking */ + FullCost lastdump_cost; /* Cost at last total cost computation */ - /* CSV trace: per-thread snapshot of cost at last sample emission */ - FullCost last_sample_cost; + /* CSV trace: per-thread snapshot of cost at last sample emission */ + FullCost last_sample_cost; - /* Inline tracking: current inline call stack (outermost first) */ - const HChar* cur_inl_fns[TG_MAX_INL_DEPTH]; - UInt cur_inl_depth; + /* Inline tracking: current inline call stack (outermost first) */ + const HChar* cur_inl_fns[TG_MAX_INL_DEPTH]; + UInt cur_inl_depth; - /* thread specific data structure containers */ - fn_array fn_active; - jcc_hash jccs; - bbcc_hash bbccs; + /* thread specific data structure containers */ + fn_array fn_active; + jcc_hash jccs; + bbcc_hash bbccs; }; /*------------------------------------------------------------*/ /*--- Cache simulator interface ---*/ /*------------------------------------------------------------*/ -struct cachesim_if -{ - void (*print_opts)(void); - Bool (*parse_opt)(const HChar* arg); - void (*post_clo_init)(void); - void (*clear)(void); - void (*printstat)(Int,Int,Int); - void (*finish)(void); - - void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); - void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); - void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); - - void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); - void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); - - void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); - void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); - - // function names of helpers (for debugging generated code) - const HChar *log_1I0D_name, *log_2I0D_name, *log_3I0D_name; - const HChar *log_1I1Dr_name, *log_1I1Dw_name; - const HChar *log_0I1Dr_name, *log_0I1Dw_name; +struct cachesim_if { + void (*print_opts)(void); + Bool (*parse_opt)(const HChar* arg); + void (*post_clo_init)(void); + void (*clear)(void); + void (*printstat)(Int, Int, Int); + void (*finish)(void); + + void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); + void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); + void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); + + void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + // function names of helpers (for debugging generated code) + const HChar *log_1I0D_name, *log_2I0D_name, *log_3I0D_name; + const HChar *log_1I1Dr_name, *log_1I1Dw_name; + const HChar *log_0I1Dr_name, *log_0I1Dw_name; }; // Event groups -#define EG_USE 0 -#define EG_IR 1 -#define EG_DR 2 -#define EG_DW 3 -#define EG_BC 4 -#define EG_BI 5 -#define EG_BUS 6 -#define 
EG_SYS 7 +#define EG_USE 0 +#define EG_IR 1 +#define EG_DR 2 +#define EG_DW 3 +#define EG_BC 4 +#define EG_BI 5 +#define EG_BUS 6 +#define EG_SYS 7 struct event_sets { - EventSet *base, *full; + EventSet *base, *full; }; #define fullOffset(group) (TG_(sets).full->offset[group]) - /*------------------------------------------------------------*/ /*--- Trace output state ---*/ /*------------------------------------------------------------*/ typedef struct { - Int fd; /* Output file descriptor (-1 if not open) */ - ULong seq; /* Global sequence counter */ - Bool initialized; /* Has the output been opened? */ - Bool header_written; /* Has the schema chunk been written? */ + Int fd; /* Output file descriptor (-1 if not open) */ + ULong seq; /* Global sequence counter */ + Bool initialized; /* Has the output been opened? */ + Bool header_written; /* Has the schema chunk been written? */ } trace_output; - /*------------------------------------------------------------*/ /*--- Functions ---*/ /*------------------------------------------------------------*/ @@ -658,47 +640,51 @@ void TG_(print_debug_usage)(void); void TG_(init_eventsets)(void); /* from main.c */ -Bool TG_(get_debug_info)(Addr, const HChar **dirname, - const HChar **filename, - const HChar **fn_name, UInt*, DebugInfo**); +Bool TG_(get_debug_info)(Addr, + const HChar** dirname, + const HChar** filename, + const HChar** fn_name, + UInt*, + DebugInfo**); void TG_(collectBlockInfo)(IRSB* bbIn, UInt*, UInt*, Bool*); -void TG_(set_instrument_state)(const HChar*,Bool); +void TG_(set_instrument_state)(const HChar*, Bool); void TG_(compute_total_cost)(void); void TG_(fini)(Int exitcode); /* from bb.c */ -void TG_(init_bb_hash)(void); +void TG_(init_bb_hash)(void); bb_hash* TG_(get_bb_hash)(void); -BB* TG_(get_bb)(Addr addr, IRSB* bb_in, Bool *seen_before); -void TG_(delete_bb)(Addr addr); +BB* TG_(get_bb)(Addr addr, IRSB* bb_in, Bool* seen_before); +void TG_(delete_bb)(Addr addr); -static __inline__ Addr bb_addr(BB* bb) - { return bb->offset + bb->obj->offset; } +static __inline__ Addr bb_addr(BB* bb) { return bb->offset + bb->obj->offset; } static __inline__ Addr bb_jmpaddr(BB* bb) - { UInt off = (bb->instr_count > 0) ? bb->instr[bb->instr_count-1].instr_offset : 0; - return off + bb->offset + bb->obj->offset; } +{ + UInt off = + (bb->instr_count > 0) ? 
bb->instr[bb->instr_count - 1].instr_offset : 0; + return off + bb->offset + bb->obj->offset; +} /* from fn.c */ -void TG_(init_fn_array)(fn_array*); -void TG_(copy_current_fn_array)(fn_array* dst); +void TG_(init_fn_array)(fn_array*); +void TG_(copy_current_fn_array)(fn_array* dst); fn_array* TG_(get_current_fn_array)(void); -void TG_(set_current_fn_array)(fn_array*); -UInt* TG_(get_fn_entry)(Int n); +void TG_(set_current_fn_array)(fn_array*); +UInt* TG_(get_fn_entry)(Int n); void TG_(init_obj_table)(void); obj_node* TG_(get_obj_node)(DebugInfo* si); -file_node* TG_(get_file_node)(obj_node*, const HChar *dirname, - const HChar* filename); -fn_node* TG_(get_fn_node)(BB* bb); +file_node* + TG_(get_file_node)(obj_node*, const HChar* dirname, const HChar* filename); +fn_node* TG_(get_fn_node)(BB* bb); /* from bbcc.c */ -void TG_(init_bbcc_hash)(bbcc_hash* bbccs); -void TG_(copy_current_bbcc_hash)(bbcc_hash* dst); +void TG_(init_bbcc_hash)(bbcc_hash* bbccs); +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst); bbcc_hash* TG_(get_current_bbcc_hash)(void); -void TG_(set_current_bbcc_hash)(bbcc_hash*); -BBCC* TG_(get_bbcc)(BB* bb); -void TG_(setup_bbcc)(BB* bb) VG_REGPARM(1); - +void TG_(set_current_bbcc_hash)(bbcc_hash*); +BBCC* TG_(get_bbcc)(BB* bb); +void TG_(setup_bbcc)(BB* bb) VG_REGPARM(1); /* from jumps.c */ void TG_(init_jcc_hash)(jcc_hash*); @@ -707,31 +693,31 @@ void TG_(set_current_jcc_hash)(jcc_hash*); jCC* TG_(get_jcc)(BBCC* from, UInt, BBCC* to); /* from callstack.c */ -void TG_(init_call_stack)(call_stack*); -void TG_(copy_current_call_stack)(call_stack* dst); -void TG_(set_current_call_stack)(call_stack*); +void TG_(init_call_stack)(call_stack*); +void TG_(copy_current_call_stack)(call_stack* dst); +void TG_(set_current_call_stack)(call_stack*); call_entry* TG_(get_call_entry)(Int n); void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip); void TG_(pop_call_stack)(void); -Int TG_(unwind_call_stack)(Addr sp, Int); +Int TG_(unwind_call_stack)(Addr sp, Int); /* from context.c */ void TG_(init_fn_stack)(fn_stack*); void TG_(copy_current_fn_stack)(fn_stack*); void TG_(set_current_fn_stack)(fn_stack*); -void TG_(init_cxt_table)(void); +void TG_(init_cxt_table)(void); Context* TG_(get_cxt)(fn_node** fn); -void TG_(push_cxt)(fn_node* fn); +void TG_(push_cxt)(fn_node* fn); /* from threads.c */ -void TG_(init_threads)(void); +void TG_(init_threads)(void); thread_info** TG_(get_threads)(void); -thread_info* TG_(get_current_thread)(void); -void TG_(switch_thread)(ThreadId tid); -void TG_(forall_threads)(void (*func)(thread_info*)); -void TG_(run_thread)(ThreadId tid); +thread_info* TG_(get_current_thread)(void); +void TG_(switch_thread)(ThreadId tid); +void TG_(forall_threads)(void (*func)(thread_info*)); +void TG_(run_thread)(ThreadId tid); void TG_(init_exec_state)(exec_state* es); void TG_(init_exec_stack)(exec_stack*); @@ -759,20 +745,20 @@ void TG_(trace_close_output)(void); /*------------------------------------------------------------*/ extern CommandLineOptions TG_(clo); -extern Statistics TG_(stat); -extern EventMapping* TG_(dumpmap); -extern trace_output TG_(trace_out); +extern Statistics TG_(stat); +extern EventMapping* TG_(dumpmap); +extern trace_output TG_(trace_out); /* Function active counter array, indexed by function number */ extern UInt* TG_(fn_active_array); -extern Bool TG_(instrument_state); - /* min of L1 and LL cache line sizes */ -extern Int TG_(min_line_size); -extern call_stack TG_(current_call_stack); -extern fn_stack TG_(current_fn_stack); 
-extern exec_state TG_(current_state); -extern ThreadId TG_(current_tid); -extern FullCost TG_(total_cost); +extern Bool TG_(instrument_state); +/* min of L1 and LL cache line sizes */ +extern Int TG_(min_line_size); +extern call_stack TG_(current_call_stack); +extern fn_stack TG_(current_fn_stack); +extern exec_state TG_(current_state); +extern ThreadId TG_(current_tid); +extern FullCost TG_(total_cost); extern struct cachesim_if TG_(cachesim); extern struct event_sets TG_(sets); @@ -780,33 +766,35 @@ extern struct event_sets TG_(sets); extern Addr TG_(bb_base); extern ULong* TG_(cost_base); - /*------------------------------------------------------------*/ /*--- Debug output ---*/ /*------------------------------------------------------------*/ #if TG_ENABLE_DEBUG -#define TG_DEBUGIF(x) \ - if (UNLIKELY( (TG_(clo).verbose >x) && \ +#define TG_DEBUGIF(x) \ + if (UNLIKELY((TG_(clo).verbose > x) && \ (TG_(stat).bb_executions >= TG_(clo).verbose_start))) -#define TG_DEBUG(x,format,args...) \ - TG_DEBUGIF(x) { \ - TG_(print_bbno)(); \ - VG_(printf)(format,##args); \ - } +#define TG_DEBUG(x, format, args...) \ + TG_DEBUGIF(x) \ + { \ + TG_(print_bbno)(); \ + VG_(printf)(format, ##args); \ + } -#define TG_ASSERT(cond) \ - if (UNLIKELY(!(cond))) { \ - TG_(print_context)(); \ - TG_(print_bbno)(); \ - tl_assert(cond); \ - } +#define TG_ASSERT(cond) \ + if (UNLIKELY(!(cond))) { \ + TG_(print_context)(); \ + TG_(print_bbno)(); \ + tl_assert(cond); \ + } #else #define TG_DEBUGIF(x) if (0) -#define TG_DEBUG(x...) {} +#define TG_DEBUG(x...) \ + { \ + } #define TG_ASSERT(cond) tl_assert(cond); #endif @@ -830,11 +818,11 @@ void TG_(print_addr_ln)(Addr addr); void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f); void* TG_(free)(void* p, const HChar* f); #if 0 -#define TG_MALLOC(_cc,x) TG_(malloc)((_cc),x,__FUNCTION__) -#define TG_FREE(p) TG_(free)(p,__FUNCTION__) +#define TG_MALLOC(_cc, x) TG_(malloc)((_cc), x, __FUNCTION__) +#define TG_FREE(p) TG_(free)(p, __FUNCTION__) #else -#define TG_MALLOC(_cc,x) VG_(malloc)((_cc),x) -#define TG_FREE(p) VG_(free)(p) +#define TG_MALLOC(_cc, x) VG_(malloc)((_cc), x) +#define TG_FREE(p) VG_(free)(p) #endif #endif /* TG_GLOBAL */ diff --git a/tracegrind/jumps.c b/tracegrind/jumps.c index d74deba41..f25d062cb 100644 --- a/tracegrind/jumps.c +++ b/tracegrind/jumps.c @@ -30,12 +30,10 @@ /*--- Jump Cost Center (JCC) operations, including Calls ---*/ /*------------------------------------------------------------*/ -#define N_JCC_INITIAL_ENTRIES 4437 +#define N_JCC_INITIAL_ENTRIES 4437 static jcc_hash current_jccs; - - void TG_(init_jcc_hash)(jcc_hash* jccs) { Int i; @@ -44,93 +42,87 @@ void TG_(init_jcc_hash)(jcc_hash* jccs) jccs->size = N_JCC_INITIAL_ENTRIES; jccs->entries = 0; - jccs->table = (jCC**) TG_MALLOC("cl.jumps.ijh.1", - jccs->size * sizeof(jCC*)); + jccs->table = (jCC**)TG_MALLOC("cl.jumps.ijh.1", jccs->size * sizeof(jCC*)); jccs->spontaneous = 0; for (i = 0; i < jccs->size; i++) - jccs->table[i] = 0; + jccs->table[i] = 0; } - void TG_(copy_current_jcc_hash)(jcc_hash* dst) { - TG_ASSERT(dst != 0); + TG_ASSERT(dst != 0); - dst->size = current_jccs.size; - dst->entries = current_jccs.entries; - dst->table = current_jccs.table; - dst->spontaneous = current_jccs.spontaneous; + dst->size = current_jccs.size; + dst->entries = current_jccs.entries; + dst->table = current_jccs.table; + dst->spontaneous = current_jccs.spontaneous; } void TG_(set_current_jcc_hash)(jcc_hash* h) { - TG_ASSERT(h != 0); + TG_ASSERT(h != 0); - current_jccs.size = h->size; - 
current_jccs.entries = h->entries; - current_jccs.table = h->table; - current_jccs.spontaneous = h->spontaneous; + current_jccs.size = h->size; + current_jccs.entries = h->entries; + current_jccs.table = h->table; + current_jccs.spontaneous = h->spontaneous; } -__inline__ -static UInt jcc_hash_idx(BBCC* from, UInt jmp, BBCC* to, UInt size) +__inline__ static UInt jcc_hash_idx(BBCC* from, UInt jmp, BBCC* to, UInt size) { - return (UInt) ( (UWord)from + 7* (UWord)to + 13*jmp) % size; -} + return (UInt)((UWord)from + 7 * (UWord)to + 13 * jmp) % size; +} /* double size of jcc table */ static void resize_jcc_table(void) { - Int i, new_size, conflicts1 = 0, conflicts2 = 0; - jCC** new_table; - UInt new_idx; - jCC *curr_jcc, *next_jcc; - - new_size = 2* current_jccs.size +3; - new_table = (jCC**) TG_MALLOC("cl.jumps.rjt.1", - new_size * sizeof(jCC*)); - - for (i = 0; i < new_size; i++) + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + jCC** new_table; + UInt new_idx; + jCC * curr_jcc, *next_jcc; + + new_size = 2 * current_jccs.size + 3; + new_table = (jCC**)TG_MALLOC("cl.jumps.rjt.1", new_size * sizeof(jCC*)); + + for (i = 0; i < new_size; i++) new_table[i] = NULL; - - for (i = 0; i < current_jccs.size; i++) { - if (current_jccs.table[i] == NULL) continue; - - curr_jcc = current_jccs.table[i]; - while (NULL != curr_jcc) { - next_jcc = curr_jcc->next_hash; - - new_idx = jcc_hash_idx(curr_jcc->from, curr_jcc->jmp, - curr_jcc->to, new_size); - - curr_jcc->next_hash = new_table[new_idx]; - new_table[new_idx] = curr_jcc; - if (curr_jcc->next_hash) { - conflicts1++; - if (curr_jcc->next_hash->next_hash) - conflicts2++; - } - - curr_jcc = next_jcc; - } - } - - VG_(free)(current_jccs.table); - - - TG_DEBUG(0, "Resize JCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", - current_jccs.size, new_size, - current_jccs.entries, conflicts1, conflicts2); - - current_jccs.size = new_size; - current_jccs.table = new_table; - TG_(stat).jcc_hash_resizes++; -} + for (i = 0; i < current_jccs.size; i++) { + if (current_jccs.table[i] == NULL) + continue; + + curr_jcc = current_jccs.table[i]; + while (NULL != curr_jcc) { + next_jcc = curr_jcc->next_hash; + + new_idx = + jcc_hash_idx(curr_jcc->from, curr_jcc->jmp, curr_jcc->to, new_size); + curr_jcc->next_hash = new_table[new_idx]; + new_table[new_idx] = curr_jcc; + if (curr_jcc->next_hash) { + conflicts1++; + if (curr_jcc->next_hash->next_hash) + conflicts2++; + } -/* new jCC structure: a call was done to a BB of a BBCC + curr_jcc = next_jcc; + } + } + + VG_(free)(current_jccs.table); + + TG_DEBUG(0, "Resize JCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_jccs.size, new_size, current_jccs.entries, conflicts1, + conflicts2); + + current_jccs.size = new_size; + current_jccs.table = new_table; + TG_(stat).jcc_hash_resizes++; +} + +/* new jCC structure: a call was done to a BB of a BBCC * for a spontaneous call, from is 0 (i.e. 
caller unknown) */ static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) @@ -141,93 +133,87 @@ static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) /* check fill degree of jcc hash table and resize if needed (>80%) */ current_jccs.entries++; if (10 * current_jccs.entries / current_jccs.size > 8) - resize_jcc_table(); + resize_jcc_table(); - jcc = (jCC*) TG_MALLOC("cl.jumps.nj.1", sizeof(jCC)); + jcc = (jCC*)TG_MALLOC("cl.jumps.nj.1", sizeof(jCC)); - jcc->from = from; - jcc->jmp = jmp; - jcc->to = to; - jcc->jmpkind = jk_Call; + jcc->from = from; + jcc->jmp = jmp; + jcc->to = to; + jcc->jmpkind = jk_Call; jcc->call_counter = 0; - jcc->cost = 0; + jcc->cost = 0; /* insert into JCC chain of calling BBCC. * This list is only used at dumping time */ if (from) { - /* Prohibit corruption by array overrun */ - TG_ASSERT(jmp <= from->bb->cjmp_count); - jcc->next_from = from->jmp[jmp].jcc_list; - from->jmp[jmp].jcc_list = jcc; - } - else { - jcc->next_from = current_jccs.spontaneous; - current_jccs.spontaneous = jcc; + /* Prohibit corruption by array overrun */ + TG_ASSERT(jmp <= from->bb->cjmp_count); + jcc->next_from = from->jmp[jmp].jcc_list; + from->jmp[jmp].jcc_list = jcc; + } else { + jcc->next_from = current_jccs.spontaneous; + current_jccs.spontaneous = jcc; } /* insert into JCC hash table */ - new_idx = jcc_hash_idx(from, jmp, to, current_jccs.size); - jcc->next_hash = current_jccs.table[new_idx]; + new_idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc->next_hash = current_jccs.table[new_idx]; current_jccs.table[new_idx] = jcc; TG_(stat).distinct_jccs++; - TG_DEBUGIF(3) { - VG_(printf)(" new_jcc (now %d): %p\n", - TG_(stat).distinct_jccs, jcc); + TG_DEBUGIF(3) + { + VG_(printf)(" new_jcc (now %d): %p\n", TG_(stat).distinct_jccs, jcc); } return jcc; } - /* get the jCC for a call arc (BBCC->BBCC) */ jCC* TG_(get_jcc)(BBCC* from, UInt jmp, BBCC* to) { - jCC* jcc; - UInt idx; - - TG_DEBUG(5, "+ get_jcc(bbcc %p/%u => bbcc %p)\n", - from, jmp, to); - - /* first check last recently used JCC */ - jcc = to->lru_to_jcc; - if (jcc && (jcc->from == from) && (jcc->jmp == jmp)) { - TG_ASSERT(to == jcc->to); - TG_DEBUG(5,"- get_jcc: [LRU to] jcc %p\n", jcc); - return jcc; - } - - jcc = from->lru_from_jcc; - if (jcc && (jcc->to == to) && (jcc->jmp == jmp)) { - TG_ASSERT(from == jcc->from); - TG_DEBUG(5, "- get_jcc: [LRU from] jcc %p\n", jcc); - return jcc; - } - - TG_(stat).jcc_lru_misses++; - - idx = jcc_hash_idx(from, jmp, to, current_jccs.size); - jcc = current_jccs.table[idx]; - - while(jcc) { - if ((jcc->from == from) && - (jcc->jmp == jmp) && - (jcc->to == to)) break; - jcc = jcc->next_hash; - } - - if (!jcc) - jcc = new_jcc(from, jmp, to); - - /* set LRU */ - from->lru_from_jcc = jcc; - to->lru_to_jcc = jcc; - - TG_DEBUG(5, "- get_jcc(bbcc %p => bbcc %p)\n", - from, to); - - return jcc; -} + jCC* jcc; + UInt idx; + + TG_DEBUG(5, "+ get_jcc(bbcc %p/%u => bbcc %p)\n", from, jmp, to); + + /* first check last recently used JCC */ + jcc = to->lru_to_jcc; + if (jcc && (jcc->from == from) && (jcc->jmp == jmp)) { + TG_ASSERT(to == jcc->to); + TG_DEBUG(5, "- get_jcc: [LRU to] jcc %p\n", jcc); + return jcc; + } + + jcc = from->lru_from_jcc; + if (jcc && (jcc->to == to) && (jcc->jmp == jmp)) { + TG_ASSERT(from == jcc->from); + TG_DEBUG(5, "- get_jcc: [LRU from] jcc %p\n", jcc); + return jcc; + } + + TG_(stat).jcc_lru_misses++; + idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc = current_jccs.table[idx]; + + while (jcc) { + if ((jcc->from == from) && (jcc->jmp == jmp) && (jcc->to == to)) + 
break; + jcc = jcc->next_hash; + } + + if (!jcc) + jcc = new_jcc(from, jmp, to); + + /* set LRU */ + from->lru_from_jcc = jcc; + to->lru_to_jcc = jcc; + + TG_DEBUG(5, "- get_jcc(bbcc %p => bbcc %p)\n", from, to); + + return jcc; +} diff --git a/tracegrind/lz4.c b/tracegrind/lz4.c index a1f02e75d..e0af37e2b 100644 --- a/tracegrind/lz4.c +++ b/tracegrind/lz4.c @@ -33,16 +33,17 @@ */ /*-************************************ -* Tuning parameters -**************************************/ + * Tuning parameters + **************************************/ /* * LZ4_HEAPMODE : * Select how stateless compression functions like `LZ4_compress_default()` * allocate memory for their hash table, - * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + * in memory stack (0:default, fastest), or in memory heap (1:requires + * malloc()). */ #ifndef LZ4_HEAPMODE -# define LZ4_HEAPMODE 0 +#define LZ4_HEAPMODE 0 #endif /* @@ -57,93 +58,105 @@ */ #define LZ4_ACCELERATION_MAX 65537 - /*-************************************ -* CPU Feature Detection -**************************************/ + * CPU Feature Detection + **************************************/ /* LZ4_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets which assembly generation depends on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. Unfortunately, on some target/compiler combinations, the + * generated assembly is sub-optimal. The below switch allow to select different + * access method for improved performance. Method 0 (default) : use `memcpy()`. + * Safe and portable. Method 1 : `__packed` statement. It depends on compiler + * extension (ie, not portable). This method is safe if your compiler supports + * it, and *generally* as fast or faster than `memcpy`. Method 2 : direct + * access. This method is portable but violate C standard. It can generate buggy + * code on targets which assembly generation depends on alignment. But in some + * circumstances, it's the only known way to get the most performance (ie GCC + + * ARMv6) See + * https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html + * for details. 
Prefer these methods in priority order (0 > 1 > 2) */ -#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ -# if defined(__GNUC__) && \ - ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \ - || (defined(__riscv) && defined(__riscv_zicclsm)) ) -# define LZ4_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || defined(_MSC_VER) -# define LZ4_FORCE_MEMORY_ACCESS 1 -# endif +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +#if defined(__GNUC__) && \ + (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \ + (defined(__riscv) && defined(__riscv_zicclsm))) +#define LZ4_FORCE_MEMORY_ACCESS 2 +#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || \ + defined(_MSC_VER) +#define LZ4_FORCE_MEMORY_ACCESS 1 +#endif #endif /* * LZ4_FORCE_SW_BITCOUNT - * Define this parameter if your target system or compiler does not support hardware bit count + * Define this parameter if your target system or compiler does not support + * hardware bit count */ -#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ -# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ -# define LZ4_FORCE_SW_BITCOUNT +#if defined(_MSC_VER) && \ + defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit \ + count */ +#undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +#define LZ4_FORCE_SW_BITCOUNT #endif - - /*-************************************ -* Dependency -**************************************/ + * Dependency + **************************************/ /* * LZ4_SRC_INCLUDED: * Amalgamation flag, whether lz4.c is included */ #ifndef LZ4_SRC_INCLUDED -# define LZ4_SRC_INCLUDED 1 +#define LZ4_SRC_INCLUDED 1 #endif #ifndef LZ4_DISABLE_DEPRECATE_WARNINGS -# define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to \ + LZ4_decompress_safe_withPrefix64k */ #endif #ifndef LZ4_STATIC_LINKING_ONLY -# define LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY #endif #include "lz4.h" /* see also "memory routines" below */ - /*-************************************ -* Compiler Options -**************************************/ -#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ -# include /* only present in VS2005+ */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma warning(disable : 6237) /* disable: C6237: conditional expression is always 0 */ -# pragma warning(disable : 6239) /* disable: C6239: ( && ) always evaluates to the result of */ -# pragma warning(disable : 6240) /* disable: C6240: ( && ) always evaluates to the result of */ -# pragma warning(disable : 6326) /* disable: C6326: Potential comparison of a constant with another constant */ -#endif /* _MSC_VER */ + * Compiler Options + **************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +#include /* only present in VS2005+ */ +#pragma warning( \ + disable : 4127) /* disable: C4127: conditional expression is constant */ +#pragma warning( \ + disable : 6237) /* disable: C6237: conditional expression is always 0 */ +#pragma warning( \ + disable : 6239) /* disable: C6239: 
( && ) \ + always evaluates to the result of */ +#pragma warning( \ + disable : 6240) /* disable: C6240: ( && ) \ + always evaluates to the result of */ +#pragma warning(disable : 6326) /* disable: C6326: Potential comparison of a \ + constant with another constant */ +#endif /* _MSC_VER */ #ifndef LZ4_FORCE_INLINE -# if defined (_MSC_VER) && !defined (__clang__) /* MSVC */ -# define LZ4_FORCE_INLINE static __forceinline -# else -# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# if defined (__GNUC__) || defined (__clang__) -# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define LZ4_FORCE_INLINE static inline -# endif -# else -# define LZ4_FORCE_INLINE static -# endif /* __STDC_VERSION__ */ -# endif /* _MSC_VER */ +#if defined(_MSC_VER) && !defined(__clang__) /* MSVC */ +#define LZ4_FORCE_INLINE static __forceinline +#else +#if defined(__cplusplus) || \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +#if defined(__GNUC__) || defined(__clang__) +#define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +#else +#define LZ4_FORCE_INLINE static inline +#endif +#else +#define LZ4_FORCE_INLINE static +#endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ #endif /* LZ4_FORCE_INLINE */ /* LZ4_FORCE_O2 and LZ4_FORCE_INLINE @@ -160,37 +173,40 @@ * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute * of LZ4_wildCopy8 does not affect the compression speed. */ -#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) -# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) -# undef LZ4_FORCE_INLINE -# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && \ + !defined(__clang__) +#define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +#undef LZ4_FORCE_INLINE +#define LZ4_FORCE_INLINE \ + static __inline __attribute__((optimize("O2"), always_inline)) #else -# define LZ4_FORCE_O2 +#define LZ4_FORCE_O2 #endif -#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ + (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ + defined(__clang__) +#define expect(expr, value) (__builtin_expect((expr), (value))) #else -# define expect(expr,value) (expr) +#define expect(expr, value) (expr) #endif #ifndef likely -#define likely(expr) expect((expr) != 0, 1) +#define likely(expr) expect((expr) != 0, 1) #endif #ifndef unlikely -#define unlikely(expr) expect((expr) != 0, 0) +#define unlikely(expr) expect((expr) != 0, 0) #endif /* Should the alignment test prove unreliable, for some reason, * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ -#ifndef LZ4_ALIGN_TEST /* can be externally provided */ -# define LZ4_ALIGN_TEST 1 +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +#define LZ4_ALIGN_TEST 1 #endif - /*-************************************ -* Memory routines -**************************************/ + * Memory routines + **************************************/ /*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : * Disable relatively high-level LZ4/HC functions that use dynamic memory @@ -202,521 +218,603 @@ * * The following public functions are removed when this symbol is defined. 
* - lz4 : LZ4_createStream, LZ4_freeStream, - * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create (deprecated) + * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create + * (deprecated) * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) * - lz4frame, lz4file : All LZ4F_* functions */ #if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -# define ALLOC(s) lz4_error_memory_allocation_is_disabled -# define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled -# define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#define ALLOC(s) lz4_error_memory_allocation_is_disabled +#define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +#define FREEMEM(p) lz4_error_memory_allocation_is_disabled #elif defined(LZ4_USER_MEMORY_FUNCTIONS) /* memory management functions can be customized by user project. * Below functions must exist somewhere in the Project * and be available at link time */ -void* LZ4_malloc(size_t s); -void* LZ4_calloc(size_t n, size_t s); -void LZ4_free(void* p); -# define ALLOC(s) LZ4_malloc(s) -# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s) -# define FREEMEM(p) LZ4_free(p) +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +#define ALLOC(s) LZ4_malloc(s) +#define ALLOC_AND_ZERO(s) LZ4_calloc(1, s) +#define FREEMEM(p) LZ4_free(p) #else -# include /* malloc, calloc, free */ -# define ALLOC(s) malloc(s) -# define ALLOC_AND_ZERO(s) calloc(1,s) -# define FREEMEM(p) free(p) +#include /* malloc, calloc, free */ +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1, s) +#define FREEMEM(p) free(p) #endif -#if ! LZ4_FREESTANDING -# include /* memset, memcpy */ +#if !LZ4_FREESTANDING +#include /* memset, memcpy */ #endif #if !defined(LZ4_memset) -# define LZ4_memset(p,v,s) memset((p),(v),(s)) +#define LZ4_memset(p, v, s) memset((p), (v), (s)) #endif -#define MEM_INIT(p,v,s) LZ4_memset((p),(v),(s)) - +#define MEM_INIT(p, v, s) LZ4_memset((p), (v), (s)) /*-************************************ -* Common Constants -**************************************/ + * Common Constants + **************************************/ #define MINMATCH 4 #define WILDCOPYLENGTH 8 -#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ -#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ -#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define LASTLITERALS \ + 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions \ + */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE \ + ((2 * WILDCOPYLENGTH) - \ + MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without \ + overflowing output buffer */ #define FASTLOOP_SAFE_DISTANCE 64 -static const int LZ4_minLength = (MFLIMIT+1); +static const int LZ4_minLength = (MFLIMIT + 1); -#define KB *(1 <<10) -#define MB *(1 <<20) -#define GB *(1U<<30) +#define KB *(1 << 10) +#define MB *(1 << 20) +#define GB *(1U << 30) #define LZ4_DISTANCE_ABSOLUTE_MAX 65535 -#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ -# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#if (LZ4_DISTANCE_MAX > \ + LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +#error "LZ4_DISTANCE_MAX is too big : must be <= 65535" #endif #define ML_BITS 4 -#define ML_MASK ((1U<=1) -# include + * Error 
detection + **************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 1) +#include #else -# ifndef assert -# define assert(condition) ((void)0) -# endif +#ifndef assert +#define assert(condition) ((void)0) +#endif #endif -#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ - -#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) -# include - static int g_debuglog_enable = 1; -# define DEBUGLOG(l, ...) { \ - if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ - fprintf(stderr, __FILE__ " %i: ", __LINE__); \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, " \n"); \ - } } +#define LZ4_STATIC_ASSERT(c) \ + { \ + enum { LZ4_static_assert = 1 / (int)(!!(c)) }; \ + } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 2) +#include +static int g_debuglog_enable = 1; +#define DEBUGLOG(l, ...) \ + { \ + if ((g_debuglog_enable) && (l <= LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ " %i: ", __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } \ + } #else -# define DEBUGLOG(l, ...) {} /* disabled */ +#define DEBUGLOG(l, ...) \ + { \ + } /* disabled */ #endif static int LZ4_isAligned(const void* ptr, size_t alignment) { - return ((size_t)ptr & (alignment -1)) == 0; + return ((size_t)ptr & (alignment - 1)) == 0; } - /*-************************************ -* Types -**************************************/ + * Types + **************************************/ #include -#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include - typedef unsigned char BYTE; /*uint8_t not necessarily blessed to alias arbitrary type*/ - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; - typedef uintptr_t uptrval; +#if defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include +typedef unsigned char + BYTE; /*uint8_t not necessarily blessed to alias arbitrary type*/ +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; #else -# if UINT_MAX != 4294967295UL -# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" -# endif - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; - typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#if UINT_MAX != 4294967295UL +#error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +#endif +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +typedef signed int S32; +typedef unsigned long long U64; +typedef size_t uptrval; /* generally true, except OpenVMS-64 */ #endif #if defined(__x86_64__) - typedef U64 reg_t; /* 64-bits in x32 mode */ +typedef U64 reg_t; /* 64-bits in x32 mode */ #else - typedef size_t reg_t; /* 32-bits in x32 mode */ +typedef size_t reg_t; /* 32-bits in x32 mode */ #endif typedef enum { - notLimited = 0, - limitedOutput = 1, - fillOutput = 2 + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 } limitedOutput_directive; - /*-************************************ -* Reading and writing into memory -**************************************/ + * Reading and writing into memory + **************************************/ /** * LZ4 relies on memcpy with a constant size being inlined. 
In freestanding * environments, the compiler can't assume the implementation of memcpy() is * standard compliant, so it can't apply its specialized memcpy() inlining * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze - * memcpy() as if it were standard compliant, so it can inline it in freestanding - * environments. This is needed when decompressing the Linux Kernel, for example. + * memcpy() as if it were standard compliant, so it can inline it in + * freestanding environments. This is needed when decompressing the Linux + * Kernel, for example. */ #if !defined(LZ4_memcpy) -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) -# else -# define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) -# endif +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +#else +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +#endif #endif #if !defined(LZ4_memmove) -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define LZ4_memmove __builtin_memmove -# else -# define LZ4_memmove memmove -# endif +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memmove __builtin_memmove +#else +#define LZ4_memmove memmove +#endif #endif static unsigned LZ4_isLittleEndian(void) { - const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ - return one.c[0]; + const union { + U32 u; + BYTE c[4]; + } one = {1}; /* don't use static : performance detrimental */ + return one.c[0]; } #if defined(__GNUC__) || defined(__INTEL_COMPILER) -#define LZ4_PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__)) +#define LZ4_PACK(__Declaration__) __Declaration__ __attribute__((__packed__)) #elif defined(_MSC_VER) -#define LZ4_PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop)) +#define LZ4_PACK(__Declaration__) \ + __pragma(pack(push, 1)) __Declaration__ __pragma(pack(pop)) #endif -#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS == 2) /* lie to the compiler about data alignment; use with caution */ -static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } -static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } -static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } +static U16 LZ4_read16(const void* memPtr) { return *(const U16*)memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*)memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*)memPtr; } static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } -#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS == 1) -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ /* currently only defined for gcc and icc */ LZ4_PACK(typedef struct { U16 u16; }) LZ4_unalign16; LZ4_PACK(typedef struct { U32 u32; }) LZ4_unalign32; LZ4_PACK(typedef struct { reg_t uArch; }) LZ4_unalignST; -static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign16*)ptr)->u16; } -static U32 LZ4_read32(const void* ptr) { return ((const 
LZ4_unalign32*)ptr)->u32; } -static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalignST*)ptr)->uArch; } +static U16 LZ4_read16(const void* ptr) +{ + return ((const LZ4_unalign16*)ptr)->u16; +} +static U32 LZ4_read32(const void* ptr) +{ + return ((const LZ4_unalign32*)ptr)->u32; +} +static reg_t LZ4_read_ARCH(const void* ptr) +{ + return ((const LZ4_unalignST*)ptr)->uArch; +} -static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign16*)memPtr)->u16 = value; } -static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign32*)memPtr)->u32 = value; } +static void LZ4_write16(void* memPtr, U16 value) +{ + ((LZ4_unalign16*)memPtr)->u16 = value; +} +static void LZ4_write32(void* memPtr, U32 value) +{ + ((LZ4_unalign32*)memPtr)->u32 = value; +} -#else /* safe and portable access using memcpy() */ +#else /* safe and portable access using memcpy() */ static U16 LZ4_read16(const void* memPtr) { - U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; + U16 val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; } static U32 LZ4_read32(const void* memPtr) { - U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; + U32 val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; } static reg_t LZ4_read_ARCH(const void* memPtr) { - reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; + reg_t val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; } static void LZ4_write16(void* memPtr, U16 value) { - LZ4_memcpy(memPtr, &value, sizeof(value)); + LZ4_memcpy(memPtr, &value, sizeof(value)); } static void LZ4_write32(void* memPtr, U32 value) { - LZ4_memcpy(memPtr, &value, sizeof(value)); + LZ4_memcpy(memPtr, &value, sizeof(value)); } #endif /* LZ4_FORCE_MEMORY_ACCESS */ - static U16 LZ4_readLE16(const void* memPtr) { - if (LZ4_isLittleEndian()) { - return LZ4_read16(memPtr); - } else { - const BYTE* p = (const BYTE*)memPtr; - return (U16)((U16)p[0] | (p[1]<<8)); - } + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] | (p[1] << 8)); + } } #ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT static U32 LZ4_readLE32(const void* memPtr) { - if (LZ4_isLittleEndian()) { - return LZ4_read32(memPtr); - } else { - const BYTE* p = (const BYTE*)memPtr; - return (U32)p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24); - } + if (LZ4_isLittleEndian()) { + return LZ4_read32(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U32)p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); + } } #endif static void LZ4_writeLE16(void* memPtr, U16 value) { - if (LZ4_isLittleEndian()) { - LZ4_write16(memPtr, value); - } else { - BYTE* p = (BYTE*)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)value; + p[1] = (BYTE)(value >> 8); + } } -/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd + */ LZ4_FORCE_INLINE void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) { - BYTE* d = (BYTE*)dstPtr; - const BYTE* s = (const BYTE*)srcPtr; - BYTE* const e = (BYTE*)dstEnd; + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; - do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d= 16. 
*/ LZ4_FORCE_INLINE void LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) { - BYTE* d = (BYTE*)dstPtr; - const BYTE* s = (const BYTE*)srcPtr; - BYTE* const e = (BYTE*)dstEnd; + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; - do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d= dstPtr + MINMATCH * - there is at least 12 bytes available to write after dstEnd */ -LZ4_FORCE_INLINE void -LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) -{ - BYTE v[8]; - - assert(dstEnd >= dstPtr + MINMATCH); - - switch(offset) { - case 1: - MEM_INIT(v, *srcPtr, 8); - break; - case 2: - LZ4_memcpy(v, srcPtr, 2); - LZ4_memcpy(&v[2], srcPtr, 2); -#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ -# pragma warning(push) -# pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +LZ4_FORCE_INLINE void LZ4_memcpy_using_offset(BYTE* dstPtr, + const BYTE* srcPtr, + BYTE* dstEnd, + const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch (offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier \ + */ +#pragma warning(push) +#pragma warning( \ + disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ #endif - LZ4_memcpy(&v[4], v, 4); -#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ -# pragma warning(pop) + LZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier \ + */ +#pragma warning(pop) #endif - break; - case 4: - LZ4_memcpy(v, srcPtr, 4); - LZ4_memcpy(&v[4], srcPtr, 4); - break; - default: - LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); - return; - } - - LZ4_memcpy(dstPtr, v, 8); - dstPtr += 8; - while (dstPtr < dstEnd) { - LZ4_memcpy(dstPtr, v, 8); - dstPtr += 8; - } + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } } #endif - /*-************************************ -* Common functions -**************************************/ -static unsigned LZ4_NbCommonBytes (reg_t val) -{ - assert(val != 0); - if (LZ4_isLittleEndian()) { - if (sizeof(val) == 8) { -# if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(LZ4_FORCE_SW_BITCOUNT) + * Common functions + **************************************/ +static unsigned LZ4_NbCommonBytes(reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +#if defined(_MSC_VER) && (_MSC_VER >= 1800) && \ + (defined(_M_AMD64) && !defined(_M_ARM64EC)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) /*-************************************************************************************************* -* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11. -* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics -* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC. 
-****************************************************************************************************/ -# if defined(__clang__) && (__clang_major__ < 10) - /* Avoid undefined clang-cl intrinsics issue. - * See https://github.com/lz4/lz4/pull/1017 for details. */ - return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; -# else - /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ - return (unsigned)_tzcnt_u64(val) >> 3; -# endif -# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64(&r, (U64)val); - return (unsigned)r >> 3; -# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_ctzll((U64)val) >> 3; -# else - const U64 m = 0x0101010101010101ULL; - val ^= val - 1; - return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); -# endif - } else /* 32 bits */ { -# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward(&r, (U32)val); - return (unsigned)r >> 3; -# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_ctz((U32)val) >> 3; -# else - const U32 m = 0x01010101; - return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; -# endif - } - } else /* Big Endian CPU */ { - if (sizeof(val)==8) { -# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_clzll((U64)val) >> 3; -# else -#if 1 - /* this method is probably faster, - * but adds a 128 bytes lookup table */ - static const unsigned char ctz7_tab[128] = { - 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - }; - U64 const mask = 0x0101010101010101ULL; - U64 const t = (((val >> 8) - mask) | val) & mask; - return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; + * ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications + *on ARM64 Windows 11. The ARM64EC ABI does not support AVX/AVX2/AVX512 + *instructions, nor their relevant intrinsics including _tzcnt_u64. Therefore, + *we need to neuter the _tzcnt_u64 code path for ARM64EC. + ****************************************************************************************************/ +#if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. 
*/ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +#else + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +#endif +#elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +#elif (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +#else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +#endif + } else /* 32 bits */ { +#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +#elif (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; #else - /* this method doesn't consume memory space like the previous one, - * but it contains several branches, - * that may end up slowing execution */ - static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. - Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. - Note that this code path is never triggered in 32-bits mode. */ - unsigned r; - if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; #endif -# endif - } else /* 32 bits */ { -# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_clz((U32)val) >> 3; -# else + } + } else /* Big Endian CPU */ { + if (sizeof(val) == 8) { +#if (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +#else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, + 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, + 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, + 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, + 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = + sizeof(val) * 4; /* 32 on 64 bits (goal), 16 on 32 bits. +Just to avoid some static analyzer complaining about shift by 32 on 32-bits +target. Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val >> by32)) { + r = 4; + } else { + r = 0; + val >>= by32; + } + if (!(val >> 16)) { + r += 2; val >>= 8; - val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | - (val + 0x00FF0000)) >> 24; - return (unsigned)val ^ 3; -# endif - } - } + } else { + val >>= 24; + } + r += (!val); + return r; +#endif +#endif + } else /* 32 bits */ { +#if (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +#else + val >>= 8; + val = + ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | (val + 0x00FF0000)) >> + 24; + return (unsigned)val ^ 3; +#endif + } + } } - #define STEPSIZE sizeof(reg_t) LZ4_FORCE_INLINE unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) { - const BYTE* const pStart = pIn; - - if (likely(pIn < pInLimit-(STEPSIZE-1))) { - reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); - if (!diff) { - pIn+=STEPSIZE; pMatch+=STEPSIZE; - } else { - return LZ4_NbCommonBytes(diff); - } } - - while (likely(pIn < pInLimit-(STEPSIZE-1))) { - reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); - if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } - pIn += LZ4_NbCommonBytes(diff); - return (unsigned)(pIn - pStart); - } - - if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } - if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } - if ((pIn compression run slower on incompressible data */ - + * Local Constants + **************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT - 1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run + slower on incompressible data */ /*-************************************ -* Local Structures and types -**************************************/ + * Local Structures and types + **************************************/ typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; /** @@ -742,109 +840,155 @@ typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; * content, and matches are found by looking in the ctx * ->dictCtx->hashTable. 
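Aside (not part of the patch): LZ4_count() above returns the length of the common prefix of two byte ranges; it compares one register at a time and uses LZ4_NbCommonBytes() (trailing or leading zero bits of the XOR, divided by 8) only to finish the last partial word. A byte-at-a-time reference for the same quantity, with a hypothetical name:

static unsigned naive_common_prefix(const unsigned char* in,
                                    const unsigned char* match,
                                    const unsigned char* inLimit)
{
    unsigned n = 0;
    /* stop at the first mismatch or at inLimit, whichever comes first */
    while (in < inLimit && *in == *match) { in++; match++; n++; }
    return n;
}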
*/ -typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { + noDict = 0, + withPrefix64k, + usingExtDict, + usingDictCtx +} dict_directive; typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; - /*-************************************ -* Local Utils -**************************************/ -int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } + * Local Utils + **************************************/ +int LZ4_versionNumber(void) { return LZ4_VERSION_NUMBER; } const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } -int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } -int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } - +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } /*-**************************************** -* Internal Definitions, used only in Tests -*******************************************/ -#if defined (__cplusplus) + * Internal Definitions, used only in Tests + *******************************************/ +#if defined(__cplusplus) extern "C" { #endif -int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); - -int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, - int compressedSize, int maxOutputSize, - const void* dictStart, size_t dictSize); -int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, - int compressedSize, int targetOutputSize, int dstCapacity, - const void* dictStart, size_t dictSize); -#if defined (__cplusplus) +int LZ4_compress_forceExtDict(LZ4_stream_t* LZ4_dict, + const char* source, + char* dest, + int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const void* dictStart, + size_t dictSize); +int LZ4_decompress_safe_partial_forceExtDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const void* dictStart, + size_t dictSize); +#if defined(__cplusplus) } #endif /*-****************************** -* Compression functions -********************************/ + * Compression functions + ********************************/ LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) { - if (tableType == byU16) - return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); - else - return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH * 8) - (LZ4_HASHLOG + 1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH * 8) - LZ4_HASHLOG)); } LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) { - const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; - if (LZ4_isLittleEndian()) { - const U64 prime5bytes = 889523592379ULL; - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); - } else { - const U64 prime8bytes = 11400714785074694791ULL; - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); - } + const U32 hashLog = (tableType == byU16) ? 
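Aside (not part of the patch): LZ4_compressBound() above forwards to the LZ4_COMPRESSBOUND() macro from lz4.h, which for sizes within LZ4_MAX_INPUT_SIZE evaluates to srcSize + srcSize/255 + 16 -- the worst-case output for incompressible input. A quick check of the arithmetic, as a hypothetical helper:

/* worst-case bound, e.g. 1000 -> 1000 + 1000/255 + 16 = 1019 bytes */
static int compress_bound_example(int srcSize)
{
    /* assumes 0 <= srcSize <= LZ4_MAX_INPUT_SIZE */
    return srcSize + srcSize / 255 + 16;
}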
LZ4_HASHLOG + 1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } } -LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, + tableType_t const tableType) { - if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + if ((sizeof(reg_t) == 8) && (tableType != byU16)) + return LZ4_hash5(LZ4_read_ARCH(p), tableType); #ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT - return LZ4_hash4(LZ4_readLE32(p), tableType); + return LZ4_hash4(LZ4_readLE32(p), tableType); #else - return LZ4_hash4(LZ4_read32(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); #endif } -LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) -{ - switch (tableType) - { - default: /* fallthrough */ - case clearedTable: { /* illegal! */ assert(0); return; } - case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } - } +LZ4_FORCE_INLINE void +LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ + assert(0); + return; + } + case byPtr: { + const BYTE** hashTable = (const BYTE**)tableBase; + hashTable[h] = NULL; + return; + } + case byU32: { + U32* hashTable = (U32*)tableBase; + hashTable[h] = 0; + return; + } + case byU16: { + U16* hashTable = (U16*)tableBase; + hashTable[h] = 0; + return; + } + } } -LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) -{ - switch (tableType) - { - default: /* fallthrough */ - case clearedTable: /* fallthrough */ - case byPtr: { /* illegal! */ assert(0); return; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } - case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } - } +LZ4_FORCE_INLINE void +LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! 
*/ + assert(0); + return; + } + case byU32: { + U32* hashTable = (U32*)tableBase; + hashTable[h] = idx; + return; + } + case byU16: { + U16* hashTable = (U16*)tableBase; + assert(idx < 65536); + hashTable[h] = (U16)idx; + return; + } + } } /* LZ4_putPosition*() : only used in byPtr mode */ -LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, - void* tableBase, tableType_t const tableType) +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, + U32 h, + void* tableBase, + tableType_t const tableType) { - const BYTE** const hashTable = (const BYTE**)tableBase; - assert(tableType == byPtr); (void)tableType; - hashTable[h] = p; + const BYTE** const hashTable = (const BYTE**)tableBase; + assert(tableType == byPtr); + (void)tableType; + hashTable[h] = p; } -LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) +LZ4_FORCE_INLINE void +LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) { - U32 const h = LZ4_hashPosition(p, tableType); - LZ4_putPositionOnHash(p, h, tableBase, tableType); + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType); } /* LZ4_getIndexOnHash() : @@ -853,75 +997,81 @@ LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_ * Assumption 1 : only valid if tableType == byU32 or byU16. * Assumption 2 : h is presumed valid (within limits of hash table) */ -LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) -{ - LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); - if (tableType == byU32) { - const U32* const hashTable = (const U32*) tableBase; - assert(h < (1U << (LZ4_MEMORY_USAGE-2))); - return hashTable[h]; - } - if (tableType == byU16) { - const U16* const hashTable = (const U16*) tableBase; - assert(h < (1U << (LZ4_MEMORY_USAGE-1))); - return hashTable[h]; - } - assert(0); return 0; /* forbidden case */ -} - -static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) -{ - assert(tableType == byPtr); (void)tableType; - { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, + const void* tableBase, + tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*)tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE - 2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*)tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE - 1))); + return hashTable[h]; + } + assert(0); + return 0; /* forbidden case */ +} + +static const BYTE* +LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + assert(tableType == byPtr); + (void)tableType; + { + const BYTE* const* hashTable = (const BYTE* const*)tableBase; + return hashTable[h]; + } } LZ4_FORCE_INLINE const BYTE* -LZ4_getPosition(const BYTE* p, - const void* tableBase, tableType_t tableType) -{ - U32 const h = LZ4_hashPosition(p, tableType); - return LZ4_getPositionOnHash(h, tableBase, tableType); -} - -LZ4_FORCE_INLINE void -LZ4_prepareTable(LZ4_stream_t_internal* const cctx, - const int inputSize, - const tableType_t tableType) { - /* If the table hasn't been used, it's guaranteed to be zeroed out, and is - * therefore safe to use no matter what mode we're in. Otherwise, we figure - * out if it's safe to leave as is or whether it needs to be reset. 
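Aside (not part of the patch): LZ4_hash4()/LZ4_hash5() above are multiplicative hashes -- the 4- or 5-byte sequence at the current position is multiplied by a large odd constant and the top hashLog bits select a slot, which LZ4_putIndexOnHash()/LZ4_putPositionOnHash() then fill. A minimal sketch of the 4-byte variant with a hypothetical table size:

#include <stdint.h>

#define EXAMPLE_HASH_LOG 12   /* 4096-slot table, for illustration only */

static uint32_t hash4_example(uint32_t sequence)
{
    /* 2654435761U (close to 2^32 / golden ratio) is the constant used
     * above; keeping only the top bits spreads nearby sequences apart. */
    return (sequence * 2654435761U) >> (32 - EXAMPLE_HASH_LOG);
}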
- */ - if ((tableType_t)cctx->tableType != clearedTable) { - assert(inputSize >= 0); - if ((tableType_t)cctx->tableType != tableType - || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) - || ((tableType == byU32) && cctx->currentOffset > 1 GB) - || tableType == byPtr - || inputSize >= 4 KB) - { - DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", (void*)cctx); - MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); - cctx->currentOffset = 0; - cctx->tableType = (U32)clearedTable; - } else { - DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); - } - } - - /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, - * is faster than compressing without a gap. - * However, compressing with currentOffset == 0 is faster still, - * so we preserve that case. - */ - if (cctx->currentOffset != 0 && tableType == byU32) { - DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); - cctx->currentOffset += 64 KB; - } - - /* Finally, clear history */ - cctx->dictCtx = NULL; - cctx->dictionary = NULL; - cctx->dictSize = 0; +LZ4_getPosition(const BYTE* p, const void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType); +} + +LZ4_FORCE_INLINE void LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) +{ + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType || + ((tableType == byU16) && + cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) || + ((tableType == byU32) && cctx->currentOffset > 1 GB) || + tableType == byPtr || inputSize >= 4 KB) { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", (void*)cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. 
+ */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; } /** LZ4_compress_generic_validated() : @@ -931,413 +1081,495 @@ LZ4_prepareTable(LZ4_stream_t_internal* const cctx, * - inputSize > 0 */ LZ4_FORCE_INLINE int LZ4_compress_generic_validated( - LZ4_stream_t_internal* const cctx, - const char* const source, - char* const dest, - const int inputSize, - int* inputConsumed, /* only written when outputDirective == fillOutput */ - const int maxOutputSize, - const limitedOutput_directive outputDirective, - const tableType_t tableType, - const dict_directive dictDirective, - const dictIssue_directive dictIssue, - const int acceleration) -{ - int result; - const BYTE* ip = (const BYTE*)source; - - U32 const startIndex = cctx->currentOffset; - const BYTE* base = (const BYTE*)source - startIndex; - const BYTE* lowLimit; - - const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; - const BYTE* const dictionary = - dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; - const U32 dictSize = - dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; - const U32 dictDelta = - (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with indexes in current context */ - - int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); - U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ - const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; - const BYTE* anchor = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; - const BYTE* const matchlimit = iend - LASTLITERALS; - - /* the dictCtx currentOffset is indexed on the start of the dictionary, - * while a dictionary in the current context precedes the currentOffset */ - const BYTE* dictBase = (dictionary == NULL) ? NULL : - (dictDirective == usingDictCtx) ? - dictionary + dictSize - dictCtx->currentOffset : - dictionary + dictSize - startIndex; - - BYTE* op = (BYTE*) dest; - BYTE* const olimit = op + maxOutputSize; - - U32 offset = 0; - U32 forwardH; - - DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType); - assert(ip != NULL); - if (tableType == byU16) assert(inputSize= 1); - - lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); - - /* Update context state */ - if (dictDirective == usingDictCtx) { - /* Subsequent linked blocks can't use the dictionary. */ - /* Instead, they use the block we just compressed. 
*/ - cctx->dictCtx = NULL; - cctx->dictSize = (U32)inputSize; - } else { - cctx->dictSize += (U32)inputSize; - } - cctx->currentOffset += (U32)inputSize; - cctx->tableType = (U32)tableType; - - if (inputSizehashTable, byPtr); - } else { - LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); - } } - ip++; forwardH = LZ4_hashPosition(ip, tableType); - - /* Main Loop */ - for ( ; ; ) { - const BYTE* match; - BYTE* token; - const BYTE* filledIp; - - /* Find a match */ - if (tableType == byPtr) { - const BYTE* forwardIp = ip; - int step = 1; - int searchMatchNb = acceleration << LZ4_skipTrigger; - do { - U32 const h = forwardH; - ip = forwardIp; - forwardIp += step; - step = (searchMatchNb++ >> LZ4_skipTrigger); - - if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; - assert(ip < mflimitPlusOne); - - match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); - - } while ( (match+LZ4_DISTANCE_MAX < ip) - || (LZ4_read32(match) != LZ4_read32(ip)) ); - - } else { /* byU32, byU16 */ - - const BYTE* forwardIp = ip; - int step = 1; - int searchMatchNb = acceleration << LZ4_skipTrigger; - do { - U32 const h = forwardH; - U32 const current = (U32)(forwardIp - base); - U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); - assert(matchIndex <= current); - assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); - ip = forwardIp; - forwardIp += step; - step = (searchMatchNb++ >> LZ4_skipTrigger); - - if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; - assert(ip < mflimitPlusOne); - - if (dictDirective == usingDictCtx) { - if (matchIndex < startIndex) { - /* there was no match, try the dictionary */ - assert(tableType == byU32); - matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); - match = dictBase + matchIndex; - matchIndex += dictDelta; /* make dictCtx index comparable with current context */ - lowLimit = dictionary; - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; - } - } else if (dictDirective == usingExtDict) { - if (matchIndex < startIndex) { - DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); - assert(startIndex - matchIndex >= MINMATCH); - assert(dictBase); - match = dictBase + matchIndex; - lowLimit = dictionary; - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; - } - } else { /* single continuous memory segment */ - match = base + matchIndex; - } - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); - - DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex); - if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */ - assert(matchIndex < current); - if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) - && (matchIndex+LZ4_DISTANCE_MAX < current)) { - continue; - } /* too far */ - assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */ - - if (LZ4_read32(match) == LZ4_read32(ip)) { - if (maybe_extMem) offset = current - matchIndex; - break; /* match found */ - } - - } while(1); - } - - /* Catch up */ - filledIp = ip; - assert(ip > anchor); /* this is always true as ip has been advanced before entering the main loop */ - if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { - do { ip--; match--; } while (((ip > anchor) & (match > 
lowLimit)) && (unlikely(ip[-1] == match[-1]))); - } - - /* Encode Literals */ - { unsigned const litLength = (unsigned)(ip - anchor); - token = op++; - if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ - (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) { - return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ - } - if ((outputDirective == fillOutput) && - (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { - op--; - goto _last_literals; + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*)source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*)source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = + (const LZ4_stream_t_internal*)cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) + ? startIndex - dictCtx->currentOffset + : 0; /* make indexes in dictCtx comparable with + indexes in current context */ + + int const maybe_extMem = + (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = + startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*)source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictionary == NULL) ? NULL + : (dictDirective == usingDictCtx) + ? dictionary + dictSize - dictCtx->currentOffset + : dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*)dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", + inputSize, tableType); + assert(ip != NULL); + if (tableType == byU16) + assert(inputSize < + LZ4_64Klimit); /* Size too large (not within 64K limit) */ + if (tableType == byPtr) + assert(dictDirective == noDict); /* only supported use case with byPtr */ + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) { + return 0; + } /* Impossible to store anything */ + assert(acceleration >= 1); + + lowLimit = + (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
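Aside (not part of the patch): in the byU32/byU16 modes set up above, hash-table entries store positions as 32-bit indexes rather than pointers. Because base = source - startIndex (that is, source - cctx->currentOffset), an index computed in one block remains comparable with indexes from earlier blocks and, after the dictDelta adjustment, with a dictionary context. The conversion is plain pointer arithmetic, sketched here with hypothetical names:

static unsigned pos_to_index(const unsigned char* p, const unsigned char* base)
{
    return (unsigned)(p - base);   /* mirrors current = (U32)(forwardIp - base) above */
}

static const unsigned char* index_to_pos(unsigned idx, const unsigned char* base)
{
    return base + idx;             /* mirrors match = base + matchIndex above */
}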
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSize < LZ4_minLength) + goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + { + U32 const h = LZ4_hashPosition(ip, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip, h, cctx->hashTable, byPtr); + } else { + LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); + } + } + ip++; + forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for (;;) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) + goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); + + } while ((match + LZ4_DISTANCE_MAX < ip) || + (LZ4_read32(match) != LZ4_read32(ip))); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) + goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with + current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG( + 7, "extDict candidate: matchIndex=%5u < startIndex=%5u", + matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; } - if (litLength >= RUN_MASK) { - unsigned len = litLength - RUN_MASK; - *token = (RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, + current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { + continue; + } /* match outside of valid area */ + assert(matchIndex < current); + if (((tableType != byU16) || + (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) && + (matchIndex + LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= + LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if 
(LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) + offset = current - matchIndex; + break; /* match found */ } - else *token = (BYTE)(litLength< olimit)) { - /* the match was too close to the end, rewind and go to last literals */ - op = token; + } while (1); + } + + /* Catch up */ + filledIp = ip; + assert(ip > anchor); /* this is always true as ip has been advanced before + entering the main loop */ + if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { + do { + ip--; + match--; + } while (((ip > anchor) & (match > lowLimit)) && + (unlikely(ip[-1] == match[-1]))); + } + + /* Encode Literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == + limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + + (litLength / 255) > + olimit))) { + return 0; /* cannot compress within `dst` budget. Stored indexes in + hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength + 240) / 255 /* litlen */ + + litLength /* literals */ + 2 /* offset */ + + 1 /* token */ + MFLIMIT - + MINMATCH /* min last literals so last match is <= end + - MFLIMIT */ + > olimit))) { + op--; goto _last_literals; - } - - /* Encode Offset */ - if (maybe_extMem) { /* static test */ - DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); - assert(offset <= LZ4_DISTANCE_MAX && offset > 0); - LZ4_writeLE16(op, (U16)offset); op+=2; - } else { - DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); - assert(ip-match <= LZ4_DISTANCE_MAX); - LZ4_writeLE16(op, (U16)(ip - match)); op+=2; - } - - /* Encode MatchLength */ - { unsigned matchCode; - - if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) - && (lowLimit==dictionary) /* match within extDict */ ) { - const BYTE* limit = ip + (dictEnd-match); - assert(dictEnd > match); - if (limit > matchlimit) limit = matchlimit; - matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); - ip += (size_t)matchCode + MINMATCH; - if (ip==limit) { - unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); - matchCode += more; - ip += more; - } - DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } + if (litLength >= RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength << ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op + litLength); + op += litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor - (const BYTE*)source), litLength, + (int)(ip - (const BYTE*)source)); + } + + _next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurrence; can be within + * current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; + * must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher + * 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - + MINMATCH /* min last literals so last match is <= end - MFLIMIT */ + > olimit)) { + /* the match was too close to the end, rewind and go to last literals + */ + op = token; + goto 
_last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, + (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); + op += 2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", + (U32)(ip - match)); + assert(ip - match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); + op += 2; + } + + /* Encode MatchLength */ + { + unsigned matchCode; + + if ((dictDirective == usingExtDict || dictDirective == usingDictCtx) && + (lowLimit == dictionary) /* match within extDict */) { + const BYTE* limit = ip + (dictEnd - match); + assert(dictEnd > match); + if (limit > matchlimit) + limit = matchlimit; + matchCode = LZ4_count(ip + MINMATCH, match + MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip == limit) { + unsigned const more = + LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", + matchCode + MINMATCH); + } else { + matchCode = LZ4_count(ip + MINMATCH, match + MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", + matchCode + MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode + 240) / 255 > + olimit))) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - + 1 /* to avoid needing a zero byte */ + + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less + * than filledIp we have positions in the hash table beyond + * the current position. This is a problem if we reuse the + * hash table. So we have to remove these positions from the + * hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } } else { - matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); - ip += (size_t)matchCode + MINMATCH; - DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes + in hash table are nonetheless fine */ } - - if ((outputDirective) && /* Check output buffer overflow */ - (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) { - if (outputDirective == fillOutput) { - /* Match description too long : reduce it */ - U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; - ip -= matchCode - newMatchCode; - assert(newMatchCode < matchCode); - matchCode = newMatchCode; - if (unlikely(ip <= filledIp)) { - /* We have already filled up to filledIp so if ip ends up less than filledIp - * we have positions in the hash table beyond the current position. This is - * a problem if we reuse the hash table. So we have to remove these positions - * from the hash table. 
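Aside (not part of the patch): the RUN_MASK / ML_MASK paths above implement the LZ4 sequence format's length encoding -- a 4-bit field in the token (high nibble for the literal count, low nibble for the match length), where a value of 15 means "continued": a run of 255 bytes followed by a final remainder byte. A sketch of that continuation, with a hypothetical helper name:

/* Append the extension bytes for a length that overflowed its 4-bit
 * token field: 'length' is the full value, of which 15 already sits in
 * the token.  Returns the advanced output pointer. */
static unsigned char* write_length_extension(unsigned char* op, unsigned length)
{
    unsigned rest = length - 15;               /* 15 is RUN_MASK / ML_MASK */
    while (rest >= 255) { *op++ = 255; rest -= 255; }
    *op++ = (unsigned char)rest;               /* final byte, possibly 0 */
    return op;
}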
- */ - const BYTE* ptr; - DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); - for (ptr = ip; ptr <= filledIp; ++ptr) { - U32 const h = LZ4_hashPosition(ptr, tableType); - LZ4_clearHash(h, cctx->hashTable, tableType); - } - } - } else { - assert(outputDirective == limitedOutput); - return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ - } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4 * 255) { + op += 4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4 * 255; } - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LZ4_write32(op, 0xFFFFFFFF); - while (matchCode >= 4*255) { - op+=4; - LZ4_write32(op, 0xFFFFFFFF); - matchCode -= 4*255; - } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else - *token += (BYTE)(matchCode); - } - /* Ensure we have enough space for the last literals. */ - assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); - - anchor = ip; - - /* Test end of chunk */ - if (ip >= mflimitPlusOne) break; - - /* Fill table */ - { U32 const h = LZ4_hashPosition(ip-2, tableType); - if (tableType == byPtr) { - LZ4_putPositionOnHash(ip-2, h, cctx->hashTable, byPtr); + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert( + !(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) + break; + + /* Fill table */ + { + U32 const h = LZ4_hashPosition(ip - 2, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip - 2, h, cctx->hashTable, byPtr); + } else { + U32 const idx = (U32)((ip - 2) - base); + LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); + } + } + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType); + LZ4_putPosition(ip, cctx->hashTable, tableType); + if ((match + LZ4_DISTANCE_MAX >= ip) && + (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; } else { - U32 const idx = (U32)((ip-2) - base); - LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); - } } - - /* Test next position */ - if (tableType == byPtr) { - - match = LZ4_getPosition(ip, cctx->hashTable, tableType); - LZ4_putPosition(ip, cctx->hashTable, tableType); - if ( (match+LZ4_DISTANCE_MAX >= ip) - && (LZ4_read32(match) == LZ4_read32(ip)) ) - { token=op++; *token=0; goto _next_match; } - - } else { /* byU32, byU16 */ - - U32 const h = LZ4_hashPosition(ip, tableType); - U32 const current = (U32)(ip-base); - U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); - assert(matchIndex < current); - if (dictDirective == usingDictCtx) { - if (matchIndex < startIndex) { - /* there was no match, try the dictionary */ - assert(tableType == 
byU32); - matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); - match = dictBase + matchIndex; - lowLimit = dictionary; /* required for match length counter */ - matchIndex += dictDelta; - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; /* required for match length counter */ - } - } else if (dictDirective==usingExtDict) { - if (matchIndex < startIndex) { - assert(dictBase); - match = dictBase + matchIndex; - lowLimit = dictionary; /* required for match length counter */ - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; /* required for match length counter */ - } - } else { /* single memory segment */ - match = base + matchIndex; + match = base + matchIndex; + lowLimit = + (const BYTE*)source; /* required for match length counter */ } - LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); - assert(matchIndex < current); - if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) - && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) - && (LZ4_read32(match) == LZ4_read32(ip)) ) { - token=op++; - *token=0; - if (maybe_extMem) offset = current - matchIndex; - DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", - (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); - goto _next_match; + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = + (const BYTE*)source; /* required for match length counter */ } - } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (((dictIssue == dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) && + (((tableType == byU16) && + (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) + ? 1 + : (matchIndex + LZ4_DISTANCE_MAX >= current)) && + (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; + *token = 0; + if (maybe_extMem) + offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor - (const BYTE*)source), 0, + (int)(ip - (const BYTE*)source)); + goto _next_match; + } + } - } + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } _last_literals: - /* Encode Last Literals */ - { size_t lastRun = (size_t)(iend - anchor); - if ( (outputDirective) && /* Check output buffer overflow */ - (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { - if (outputDirective == fillOutput) { - /* adapt lastRun to fill 'dst' */ - assert(olimit >= op); - lastRun = (size_t)(olimit-op) - 1/*token*/; - lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ - } else { - assert(outputDirective == limitedOutput); - return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ - } - } - DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; - *op++ = (BYTE) accumulator; - } else { - *op++ = (BYTE)(lastRun< 0); - DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result); - return result; + /* Encode Last Literals */ + { + size_t lastRun = (size_t)(iend - anchor); + if ((outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit - op) - 1 /*token*/; + lastRun -= + (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in + hash table are nonetheless fine */ + } + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int)(((const char*)ip) - source); + } + result = (int)(((char*)op) - dest); + assert(result > 0); + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", + inputSize, result); + return result; } /** LZ4_compress_generic() : @@ -1345,64 +1577,89 @@ LZ4_FORCE_INLINE int LZ4_compress_generic_validated( * takes care of src == (NULL, 0) * and forward the rest to LZ4_compress_generic_validated */ LZ4_FORCE_INLINE int LZ4_compress_generic( - LZ4_stream_t_internal* const cctx, - const char* const src, - char* const dst, - const int srcSize, - int *inputConsumed, /* only written when outputDirective == fillOutput */ - const int dstCapacity, - const limitedOutput_directive outputDirective, - const tableType_t tableType, - const dict_directive dictDirective, - const dictIssue_directive dictIssue, - const int acceleration) -{ - DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", - srcSize, dstCapacity); - - if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */ - if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ - if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */ - DEBUGLOG(5, "Generating an empty block"); - assert(outputDirective == notLimited || dstCapacity >= 1); - assert(dst != NULL); - dst[0] = 0; - if (outputDirective == fillOutput) { - assert (inputConsumed != NULL); - *inputConsumed = 0; - } - return 1; - } - assert(src != NULL); - - return LZ4_compress_generic_validated(cctx, src, dst, srcSize, - inputConsumed, /* only written into if outputDirective == fillOutput */ - dstCapacity, outputDirective, - tableType, dictDirective, dictIssue, acceleration); -} - - -int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) -{ - LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; - assert(ctx != NULL); - if (acceleration < 1) 
acceleration = LZ4_ACCELERATION_DEFAULT; - if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; - if (maxOutputSize >= LZ4_compressBound(inputSize)) { - if (inputSize < LZ4_64Klimit) { - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); - } - } else { - if (inputSize < LZ4_64Klimit) { - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); - } - } + LZ4_stream_t_internal* const cctx, + const char* const src, + char* const dst, + const int srcSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int dstCapacity, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", srcSize, + dstCapacity); + + if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { + return 0; + } /* Unsupported srcSize, too large (or negative) */ + if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ + if (outputDirective != notLimited && dstCapacity <= 0) + return 0; /* no output, can't write anything */ + DEBUGLOG(5, "Generating an empty block"); + assert(outputDirective == notLimited || dstCapacity >= 1); + assert(dst != NULL); + dst[0] = 0; + if (outputDirective == fillOutput) { + assert(inputConsumed != NULL); + *inputConsumed = 0; + } + return 1; + } + assert(src != NULL); + + return LZ4_compress_generic_validated( + cctx, src, dst, srcSize, + inputConsumed, /* only written into if outputDirective == fillOutput */ + dstCapacity, outputDirective, tableType, dictDirective, dictIssue, + acceleration); +} + +int LZ4_compress_fast_extState(void* state, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int acceleration) +{ + LZ4_stream_t_internal* const ctx = + &LZ4_initStream(state, sizeof(LZ4_stream_t))->internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, + notLimited, byU16, noDict, noDictIssue, + acceleration); + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)source > LZ4_DISTANCE_MAX)) + ? byPtr + : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, + notLimited, tableType, noDict, noDictIssue, + acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, byU16, + noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)source > LZ4_DISTANCE_MAX)) + ? 
byPtr + : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + noDict, noDictIssue, acceleration); + } + } } /** @@ -1414,430 +1671,551 @@ int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of * "correctly initialized"). */ -int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) -{ - LZ4_stream_t_internal* const ctx = &((LZ4_stream_t*)state)->internal_donotuse; - if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; - if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; - assert(ctx != NULL); - - if (dstCapacity >= LZ4_compressBound(srcSize)) { - if (srcSize < LZ4_64Klimit) { - const tableType_t tableType = byU16; - LZ4_prepareTable(ctx, srcSize, tableType); - if (ctx->currentOffset) { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); - } else { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); - } - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - LZ4_prepareTable(ctx, srcSize, tableType); - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); - } - } else { - if (srcSize < LZ4_64Klimit) { - const tableType_t tableType = byU16; - LZ4_prepareTable(ctx, srcSize, tableType); - if (ctx->currentOffset) { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); - } else { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); - } - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - LZ4_prepareTable(ctx, srcSize, tableType); - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); - } - } -} - - -int LZ4_compress_fast(const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) -{ - int result; +int LZ4_compress_fast_extState_fastReset(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration) +{ + LZ4_stream_t_internal* const ctx = + &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + assert(ctx != NULL); + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, + dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, + noDictIssue, acceleration); + } + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr + : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, noDictIssue, + acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, + dstCapacity, limitedOutput, tableType, + noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, + dstCapacity, limitedOutput, tableType, + noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr + : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, + limitedOutput, tableType, noDict, + noDictIssue, acceleration); + } + } +} + +int LZ4_compress_fast( + const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) +{ + int result; #if (LZ4_HEAPMODE) - LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ - if (ctxPtr == NULL) return 0; + LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC( + sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) + return 0; #else - LZ4_stream_t ctx; - LZ4_stream_t* const ctxPtr = &ctx; + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; #endif - result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, acceleration); + result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, + acceleration); #if (LZ4_HEAPMODE) - FREEMEM(ctxPtr); + FREEMEM(ctxPtr); #endif - return result; + return result; } - -int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity) +int LZ4_compress_default(const char* src, + char* dst, + int srcSize, + int dstCapacity) { - return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); + return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); } - /* Note!: This function leaves the stream in an unclean/broken state! * It is not safe to subsequently use the same state with a _fastReset() or * _continue() call without resetting it. */ -static int LZ4_compress_destSize_extState_internal(LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration) -{ - void* const s = LZ4_initStream(state, sizeof (*state)); - assert(s != NULL); (void)s; - - if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ - return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, acceleration); - } else { - if (*srcSizePtr < LZ4_64Klimit) { - return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, acceleration); - } else { - tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; - return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, acceleration); - } } -} - -int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration) -{ - int const r = LZ4_compress_destSize_extState_internal((LZ4_stream_t*)state, src, dst, srcSizePtr, targetDstSize, acceleration); - /* clean the state on exit */ - LZ4_initStream(state, sizeof (LZ4_stream_t)); - return r; -} - - -int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +static int LZ4_compress_destSize_extState_internal(LZ4_stream_t* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration) +{ + void* const s = LZ4_initStream(state, sizeof(*state)); + assert(s != NULL); + (void)s; + + if (targetDstSize >= + LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, + targetDstSize, acceleration); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, + *srcSizePtr, srcSizePtr, targetDstSize, + fillOutput, byU16, noDict, noDictIssue, + acceleration); + } else { + tableType_t const addrMode = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr + : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, + *srcSizePtr, srcSizePtr, targetDstSize, + fillOutput, addrMode, noDict, noDictIssue, + acceleration); + } + } +} + +int LZ4_compress_destSize_extState(void* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration) +{ + int const r = LZ4_compress_destSize_extState_internal( + (LZ4_stream_t*)state, src, dst, srcSizePtr, targetDstSize, acceleration); + /* clean the state on exit */ + LZ4_initStream(state, sizeof(LZ4_stream_t)); + return r; +} + +int LZ4_compress_destSize(const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize) { #if (LZ4_HEAPMODE) - LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ - if (ctx == NULL) return 0; + LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC( + sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) + return 0; #else - LZ4_stream_t ctxBody; - LZ4_stream_t* const ctx = &ctxBody; + LZ4_stream_t ctxBody; + LZ4_stream_t* const ctx = &ctxBody; #endif - int result = LZ4_compress_destSize_extState_internal(ctx, src, dst, srcSizePtr, targetDstSize, 1); + int result = LZ4_compress_destSize_extState_internal( + ctx, src, dst, srcSizePtr, targetDstSize, 1); #if (LZ4_HEAPMODE) - FREEMEM(ctx); + FREEMEM(ctx); #endif - return result; + return result; } - - /*-****************************** -* Streaming functions -********************************/ + * Streaming functions + ********************************/ #if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) LZ4_stream_t* LZ4_createStream(void) { - LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); - LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); - DEBUGLOG(4, "LZ4_createStream %p", (void*)lz4s); - if (lz4s == NULL) return NULL; - LZ4_initStream(lz4s, sizeof(*lz4s)); - return lz4s; + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); + DEBUGLOG(4, 
"LZ4_createStream %p", (void*)lz4s); + if (lz4s == NULL) + return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; } #endif static size_t LZ4_stream_t_alignment(void) { #if LZ4_ALIGN_TEST - typedef struct { char c; LZ4_stream_t t; } t_a; - return sizeof(t_a) - sizeof(LZ4_stream_t); + typedef struct { + char c; + LZ4_stream_t t; + } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); #else - return 1; /* effectively disabled */ + return 1; /* effectively disabled */ #endif } -LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +LZ4_stream_t* LZ4_initStream(void* buffer, size_t size) { - DEBUGLOG(5, "LZ4_initStream"); - if (buffer == NULL) { return NULL; } - if (size < sizeof(LZ4_stream_t)) { return NULL; } - if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; - MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); - return (LZ4_stream_t*)buffer; + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { + return NULL; + } + if (size < sizeof(LZ4_stream_t)) { + return NULL; + } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) + return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; } /* resetStream is now deprecated, * prefer initStream() which is more general */ -void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +void LZ4_resetStream(LZ4_stream_t* LZ4_stream) { - DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", (void*)LZ4_stream); - MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", (void*)LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); } -void LZ4_resetStream_fast(LZ4_stream_t* ctx) { - LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +void LZ4_resetStream_fast(LZ4_stream_t* ctx) +{ + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); } #if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +int LZ4_freeStream(LZ4_stream_t* LZ4_stream) { - if (!LZ4_stream) return 0; /* support free on NULL */ - DEBUGLOG(5, "LZ4_freeStream %p", (void*)LZ4_stream); - FREEMEM(LZ4_stream); - return (0); + if (!LZ4_stream) + return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", (void*)LZ4_stream); + FREEMEM(LZ4_stream); + return (0); } #endif - typedef enum { _ld_fast, _ld_slow } LoadDict_mode_e; #define HASH_UNIT sizeof(reg_t) -int LZ4_loadDict_internal(LZ4_stream_t* LZ4_dict, - const char* dictionary, int dictSize, - LoadDict_mode_e _ld) -{ - LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; - const tableType_t tableType = byU32; - const BYTE* p = (const BYTE*)dictionary; - const BYTE* const dictEnd = p + dictSize; - U32 idx32; - - DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, (void*)dictionary, (void*)LZ4_dict); - - /* It's necessary to reset the context, - * and not just continue it with prepareTable() - * to avoid any risk of generating overflowing matchIndex - * when compressing using this dictionary */ - LZ4_resetStream(LZ4_dict); - - /* We always increment the offset by 64 KB, since, if the dict is longer, - * we truncate it to the last 64k, and if it's shorter, we still want to - * advance by a whole window length so we can provide the guarantee that - * there are only valid offsets in the window, which allows an optimization - * in LZ4_compress_fast_continue() where it uses noDictIssue even when the - * dictionary isn't a full 64k. 
*/ - dict->currentOffset += 64 KB; - - if (dictSize < (int)HASH_UNIT) { - return 0; - } - - if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; - dict->dictionary = p; - dict->dictSize = (U32)(dictEnd - p); - dict->tableType = (U32)tableType; - idx32 = dict->currentOffset - dict->dictSize; - - while (p <= dictEnd-HASH_UNIT) { - U32 const h = LZ4_hashPosition(p, tableType); - /* Note: overwriting => favors positions end of dictionary */ - LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); - p+=3; idx32+=3; - } - - if (_ld == _ld_slow) { - /* Fill hash table with additional references, to improve compression capability */ - p = dict->dictionary; - idx32 = dict->currentOffset - dict->dictSize; - while (p <= dictEnd-HASH_UNIT) { - U32 const h = LZ4_hashPosition(p, tableType); - U32 const limit = dict->currentOffset - 64 KB; - if (LZ4_getIndexOnHash(h, dict->hashTable, tableType) <= limit) { - /* Note: not overwriting => favors positions beginning of dictionary */ - LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); - } - p++; idx32++; - } - } +int LZ4_loadDict_internal(LZ4_stream_t* LZ4_dict, + const char* dictionary, + int dictSize, + LoadDict_mode_e _ld) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + U32 idx32; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, + (void*)dictionary, (void*)LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. 
*/ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) + p = dictEnd - 64 KB; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + idx32 = dict->currentOffset - dict->dictSize; + + while (p <= dictEnd - HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + /* Note: overwriting => favors positions end of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + p += 3; + idx32 += 3; + } + + if (_ld == _ld_slow) { + /* Fill hash table with additional references, to improve compression + * capability */ + p = dict->dictionary; + idx32 = dict->currentOffset - dict->dictSize; + while (p <= dictEnd - HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + U32 const limit = dict->currentOffset - 64 KB; + if (LZ4_getIndexOnHash(h, dict->hashTable, tableType) <= limit) { + /* Note: not overwriting => favors positions beginning of dictionary + */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + } + p++; + idx32++; + } + } - return (int)dict->dictSize; + return (int)dict->dictSize; } int LZ4_loadDict(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) { - return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_fast); + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_fast); } -int LZ4_loadDictSlow(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +int LZ4_loadDictSlow(LZ4_stream_t* LZ4_dict, + const char* dictionary, + int dictSize) { - return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_slow); + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_slow); } -void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream) { - const LZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? NULL : - &(dictionaryStream->internal_donotuse); + const LZ4_stream_t_internal* dictCtx = + (dictionaryStream == NULL) ? NULL + : &(dictionaryStream->internal_donotuse); - DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", - (void*)workingStream, (void*)dictionaryStream, - dictCtx != NULL ? dictCtx->dictSize : 0); + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", (void*)workingStream, + (void*)dictionaryStream, dictCtx != NULL ? dictCtx->dictSize : 0); - if (dictCtx != NULL) { - /* If the current offset is zero, we will never look in the - * external dictionary context, since there is no value a table - * entry can take that indicate a miss. In that case, we need - * to bump the offset to something non-zero. - */ - if (workingStream->internal_donotuse.currentOffset == 0) { - workingStream->internal_donotuse.currentOffset = 64 KB; - } + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } - /* Don't actually attach an empty dictionary. - */ - if (dictCtx->dictSize == 0) { - dictCtx = NULL; - } - } - workingStream->internal_donotuse.dictCtx = dictCtx; + /* Don't actually attach an empty dictionary. 
+ */ + if (dictCtx->dictSize == 0) { + dictCtx = NULL; + } + } + workingStream->internal_donotuse.dictCtx = dictCtx; } - static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) { - assert(nextSize >= 0); - if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ - /* rescale hash table */ - U32 const delta = LZ4_dict->currentOffset - 64 KB; - const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; - int i; - DEBUGLOG(4, "LZ4_renormDictT"); - for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; - else LZ4_dict->hashTable[i] -= delta; - } - LZ4_dict->currentOffset = 64 KB; - if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; - LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; - } -} - - -int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, - const char* source, char* dest, - int inputSize, int maxOutputSize, - int acceleration) -{ - const tableType_t tableType = byU32; - LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse; - const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL; - - DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize); - - LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ - if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; - if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; - - /* invalidate tiny dictionaries */ - if ( (streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ - && (dictEnd != source) /* prefix mode */ - && (inputSize > 0) /* tolerance : don't lose history, in case next invocation would use prefix mode */ - && (streamPtr->dictCtx == NULL) /* usingDictCtx */ - ) { - DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, (void*)streamPtr->dictionary); - /* remove dictionary existence from history, to employ faster prefix mode */ - streamPtr->dictSize = 0; - streamPtr->dictionary = (const BYTE*)source; - dictEnd = source; - } - - /* Check overlapping input/dictionary space */ - { const char* const sourceEnd = source + inputSize; - if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) { - streamPtr->dictSize = (U32)(dictEnd - sourceEnd); - if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; - if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; - streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; - } - } - - /* prefix mode : source data follows dictionary */ - if (dictEnd == source) { - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); - else - return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); - } - - /* external dictionary mode */ - { int result; - if (streamPtr->dictCtx) { - /* We depend here on the fact that dictCtx'es (produced by - * LZ4_loadDict) guarantee that their tables contain no references - * to offsets between dictCtx->currentOffset - 64 KB and - * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe - * to use noDictIssue even when the dict isn't a full 64 KB. 
+ assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > + 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i = 0; i < LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) + LZ4_dict->hashTable[i] = 0; + else + LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) + LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + +int LZ4_compress_fast_continue(LZ4_stream_t* LZ4_stream, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse; + const char* dictEnd = + streamPtr->dictSize + ? (const char*)streamPtr->dictionary + streamPtr->dictSize + : NULL; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", + inputSize, streamPtr->dictSize); + + LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + + /* invalidate tiny dictionaries */ + if ((streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ + && (dictEnd != source) /* prefix mode */ + && (inputSize > 0) /* tolerance : don't lose history, in case next + invocation would use prefix mode */ + && (streamPtr->dictCtx == NULL) /* usingDictCtx */ + ) { + DEBUGLOG( + 5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", + streamPtr->dictSize, (void*)streamPtr->dictionary); + /* remove dictionary existence from history, to employ faster prefix mode + */ + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = source; + } + + /* Check overlapping input/dictionary space */ + { + const char* const sourceEnd = source + inputSize; + if ((sourceEnd > (const char*)streamPtr->dictionary) && + (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) + streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == source) { + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { + int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. 
+ */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. */ - if (inputSize > 4 KB) { - /* For compressing large blobs, it is faster to pay the setup - * cost to copy the dictionary's tables into the active context, - * so that the compression loop is only looking into one table. - */ - LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); - } else { - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); - } - } else { /* small data <= 4 KB */ - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); - } else { - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); - } - } - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)inputSize; - return result; - } + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingExtDict, noDictIssue, acceleration); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingDictCtx, noDictIssue, acceleration); + } + } else { /* small data <= 4 KB */ + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, NULL, maxOutputSize, + limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } } - /* Hidden debug function, to force-test external dictionary mode */ -int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +int LZ4_compress_forceExtDict(LZ4_stream_t* LZ4_dict, + const char* source, + char* dest, + int srcSize) { - LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; - int result; + LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; + int result; - LZ4_renormDictT(streamPtr, srcSize); + LZ4_renormDictT(streamPtr, srcSize); - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { - result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); - } else { - result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); - } + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = + LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, + notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = + 
LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, + notLimited, byU32, usingExtDict, noDictIssue, 1); + } - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)srcSize; + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; - return result; + return result; } - /*! LZ4_saveDict() : - * If previously compressed data block is not guaranteed to remain available at its memory location, - * save it into a safer place (char* safeBuffer). - * Note : no need to call LZ4_loadDict() afterwards, dictionary is immediately usable, - * one can therefore call LZ4_compress_fast_continue() right after. - * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + * If previously compressed data block is not guaranteed to remain available at + * its memory location, save it into a safer place (char* safeBuffer). Note : no + * need to call LZ4_loadDict() afterwards, dictionary is immediately usable, one + * can therefore call LZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if + * error. */ -int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +int LZ4_saveDict(LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) { - LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; - DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, (void*)safeBuffer); + DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, + (void*)safeBuffer); - if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ - if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } + if ((U32)dictSize > 64 KB) { + dictSize = 64 KB; + } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { + dictSize = (int)dict->dictSize; + } - if (safeBuffer == NULL) assert(dictSize == 0); - if (dictSize > 0) { - const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; - assert(dict->dictionary); - LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); - } + if (safeBuffer == NULL) + assert(dictSize == 0); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } - dict->dictionary = (const BYTE*)safeBuffer; - dict->dictSize = (U32)dictSize; + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; - return dictSize; + return dictSize; } - - /*-******************************* * Decompression functions ********************************/ @@ -1845,8 +2223,7 @@ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; #undef MIN -#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) - +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) /* variant for decompress_unsafe() * does not know end of input @@ -1854,10 +2231,15 @@ typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; * note : will consume at least one byte */ static size_t read_long_length_no_check(const BYTE** pp) { - size_t b, l = 0; - do { b = **pp; (*pp)++; l += b; } while (b==255); - DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1) - return l; + size_t b, l = 0; + do { + b = **pp; + (*pp)++; + l += b; + } while (b == 255); + DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", + l, l / 255 + 1) + return l; } /* core decoder variant for LZ4_decompress_fast*() @@ -1868,702 +2250,842 @@ static size_t read_long_length_no_check(const BYTE** pp) * @return : nb of bytes read from input. * Note : this variant is not optimized for speed, just for maintenance. * the goal is to remove support of decompress_fast*() variants by v2.0 -**/ -LZ4_FORCE_INLINE int -LZ4_decompress_unsafe_generic( - const BYTE* const istart, - BYTE* const ostart, - int decompressedSize, - - size_t prefixSize, - const BYTE* const dictStart, /* only if dict==usingExtDict */ - const size_t dictSize /* note: =0 if dictStart==NULL */ - ) -{ - const BYTE* ip = istart; - BYTE* op = (BYTE*)ostart; - BYTE* const oend = ostart + decompressedSize; - const BYTE* const prefixStart = ostart - prefixSize; - - DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); - if (dictStart == NULL) assert(dictSize == 0); - - while (1) { - /* start new sequence */ - unsigned token = *ip++; - - /* literals */ - { size_t ll = token >> ML_BITS; - if (ll==15) { - /* long literal length */ - ll += read_long_length_no_check(&ip); + **/ +LZ4_FORCE_INLINE int LZ4_decompress_unsafe_generic( + const BYTE* const istart, + BYTE* const ostart, + int decompressedSize, + + size_t prefixSize, + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note: =0 if dictStart==NULL */ +) +{ + const BYTE* ip = istart; + BYTE* op = (BYTE*)ostart; + BYTE* const oend = ostart + decompressedSize; + const BYTE* const prefixStart = ostart - prefixSize; + + DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); + if (dictStart == NULL) + assert(dictSize == 0); + + while (1) { + /* start new sequence */ + unsigned token = *ip++; + + /* literals */ + { + size_t ll = token >> ML_BITS; + if (ll == 15) { + /* long literal length */ + ll += read_long_length_no_check(&ip); + } + if ((size_t)(oend - op) < ll) + return -1; /* output buffer overflow */ + LZ4_memmove(op, ip, ll); /* support in-place decompression */ + op += ll; + ip += ll; + if ((size_t)(oend - op) < MFLIMIT) { + if (op == oend) + break; /* end of block */ + DEBUGLOG(5, + "invalid: literals end at distance %zi from end of block", + oend - op); + /* incorrect end of block : + * last match must start at least MFLIMIT==12 bytes before end of + * output block */ + return -1; + } + } + + /* match */ + { + size_t ml = token & 15; + size_t const offset = LZ4_readLE16(ip); + ip += 2; + + if (ml == 15) { + /* long literal length */ + ml += read_long_length_no_check(&ip); + } + ml += MINMATCH; + + if ((size_t)(oend - op) < ml) + return -1; /* output buffer overflow */ + + { + const BYTE* match = op - offset; + + /* out of range */ + if (offset > (size_t)(op - prefixStart) + dictSize) { + DEBUGLOG(6, "offset out of range"); + return -1; } - if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */ - LZ4_memmove(op, ip, ll); /* support in-place decompression */ - op += ll; - ip += 
ll; - if ((size_t)(oend-op) < MFLIMIT) { - if (op==oend) break; /* end of block */ - DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op); - /* incorrect end of block : - * last match must start at least MFLIMIT==12 bytes before end of output block */ - return -1; - } } - - /* match */ - { size_t ml = token & 15; - size_t const offset = LZ4_readLE16(ip); - ip+=2; - - if (ml==15) { - /* long literal length */ - ml += read_long_length_no_check(&ip); + + /* check special case : extDict */ + if (offset > (size_t)(op - prefixStart)) { + /* extDict scenario */ + const BYTE* const dictEnd = dictStart + dictSize; + const BYTE* extMatch = + dictEnd - (offset - (size_t)(op - prefixStart)); + size_t const extml = (size_t)(dictEnd - extMatch); + if (extml > ml) { + /* match entirely within extDict */ + LZ4_memmove(op, extMatch, ml); + op += ml; + ml = 0; + } else { + /* match split between extDict & prefix */ + LZ4_memmove(op, extMatch, extml); + op += extml; + ml -= extml; + } + match = prefixStart; } - ml += MINMATCH; - - if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */ - - { const BYTE* match = op - offset; - - /* out of range */ - if (offset > (size_t)(op - prefixStart) + dictSize) { - DEBUGLOG(6, "offset out of range"); - return -1; - } - - /* check special case : extDict */ - if (offset > (size_t)(op - prefixStart)) { - /* extDict scenario */ - const BYTE* const dictEnd = dictStart + dictSize; - const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart)); - size_t const extml = (size_t)(dictEnd - extMatch); - if (extml > ml) { - /* match entirely within extDict */ - LZ4_memmove(op, extMatch, ml); - op += ml; - ml = 0; - } else { - /* match split between extDict & prefix */ - LZ4_memmove(op, extMatch, extml); - op += extml; - ml -= extml; - } - match = prefixStart; - } - - /* match copy - slow variant, supporting overlap copy */ - { size_t u; - for (u=0; u= ipmax before start of loop. Returns initial_error if so. + * @ilimit : position after which if length is not decoded, the input is + *necessarily corrupted. + * @initial_check - check ip >= ipmax before start of loop. Returns + *initial_error if so. * @error (output) - error code. Must be set to 0 before call. 
-**/ -typedef size_t Rvl_t; -static const Rvl_t rvl_error = (Rvl_t)(-1); -LZ4_FORCE_INLINE Rvl_t -read_variable_length(const BYTE** ip, const BYTE* ilimit, - int initial_check) -{ - Rvl_t s, length = 0; - assert(ip != NULL); - assert(*ip != NULL); - assert(ilimit != NULL); - if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ - return rvl_error; - } - s = **ip; - (*ip)++; - length += s; - if (unlikely((*ip) > ilimit)) { /* read limit reached */ - return rvl_error; - } - /* accumulator overflow detection (32-bit mode only) */ - if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { - return rvl_error; - } - if (likely(s != 255)) return length; - do { - s = **ip; - (*ip)++; - length += s; - if (unlikely((*ip) > ilimit)) { /* read limit reached */ - return rvl_error; - } - /* accumulator overflow detection (32-bit mode only) */ - if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { - return rvl_error; - } - } while (s == 255); - - return length; + **/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +LZ4_FORCE_INLINE Rvl_t read_variable_length(const BYTE** ip, + const BYTE* ilimit, + int initial_check) +{ + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; + } + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1) / 2))) { + return rvl_error; + } + if (likely(s != 255)) + return length; + do { + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1) / 2))) { + return rvl_error; + } + } while (s == 255); + + return length; } /*! LZ4_decompress_generic() : * This generic decompression function covers all use cases. * It shall be instantiated several times, using different sets of directives. - * Note that it is important for performance that this function really get inlined, - * in order to remove useless branches during compilation optimization. + * Note that it is important for performance that this function really get + * inlined, in order to remove useless branches during compilation optimization. */ -LZ4_FORCE_INLINE int -LZ4_decompress_generic( - const char* const src, - char* const dst, - int srcSize, - int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ - - earlyEnd_directive partialDecoding, /* full, partial */ - dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ - const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ - const BYTE* const dictStart, /* only if dict==usingExtDict */ - const size_t dictSize /* note : = 0 if noDict */ - ) -{ - if ((src == NULL) || (outputSize < 0)) { return -1; } - - { const BYTE* ip = (const BYTE*) src; - const BYTE* const iend = ip + srcSize; - - BYTE* op = (BYTE*) dst; - BYTE* const oend = op + outputSize; - BYTE* cpy; - - const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; - - const int checkOffset = (dictSize < (int)(64 KB)); - - - /* Set up the "end" pointers for the shortcut. 
*/ - const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; - const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; - - const BYTE* match; - size_t offset; - unsigned token; - size_t length; - - - DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); - - /* Special cases */ - assert(lowPrefix <= op); - if (unlikely(outputSize==0)) { - /* Empty output buffer */ - if (partialDecoding) return 0; - return ((srcSize==1) && (*ip==0)) ? 0 : -1; - } - if (unlikely(srcSize==0)) { return -1; } - - /* LZ4_FAST_DEC_LOOP: - * designed for modern OoO performance cpus, - * where copying reliably 32-bytes is preferable to an unpredictable branch. - * note : fast loop may show a regression for some client arm chips. */ +LZ4_FORCE_INLINE int LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` + */ + + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ +) +{ + if ((src == NULL) || (outputSize < 0)) { + return -1; + } + + { + const BYTE* ip = (const BYTE*)src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = + (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int checkOffset = (dictSize < (int)(64 KB)); + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, + outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if (unlikely(outputSize == 0)) { + /* Empty output buffer */ + if (partialDecoding) + return 0; + return ((srcSize == 1) && (*ip == 0)) ? 0 : -1; + } + if (unlikely(srcSize == 0)) { + return -1; + } + + /* LZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable + * branch. note : fast loop may show a regression for some client arm + * chips. 
*/ #if LZ4_FAST_DEC_LOOP - if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { - DEBUGLOG(6, "move to safe decode loop"); - goto safe_decode; - } - - /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */ - DEBUGLOG(6, "using fast decode loop"); - while (1) { - /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ - assert(oend - op >= FASTLOOP_SAFE_DISTANCE); - assert(ip < iend); - token = *ip++; - length = token >> ML_BITS; /* literal length */ - DEBUGLOG(7, "blockPos%6u: litLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); - - /* decode literal length */ - if (length == RUN_MASK) { - size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); - if (addl == rvl_error) { - DEBUGLOG(6, "error reading long literal length"); - goto _output_error; - } - length += addl; - if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ - if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ - - /* copy literals */ - LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); - if ((op+length>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } - LZ4_wildCopy32(op, ip, op+length); - ip += length; op += length; - } else if (ip <= iend-(16 + 1/*max lit + offset + nextToken*/)) { - /* We don't need to check oend, since we check it once for each loop below */ - DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); - /* Literals can only be <= 14, but hope compilers optimize better when copy by a register size */ - LZ4_memcpy(op, ip, 16); - ip += length; op += length; - } else { - goto safe_literal_copy; + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "move to safe decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < + * oend-FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using fast decode loop"); + while (1) { + /* Main fastloop assertion: We can always wildcopy + * FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend - RUN_MASK, 1); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long literal length"); + goto _output_error; } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)(op))) { + goto _output_error; + } /* overflow detection */ + if (unlikely((uptrval)(ip) + length < (uptrval)(ip))) { + goto _output_error; + } /* overflow detection */ - /* get offset */ - offset = LZ4_readLE16(ip); ip+=2; - DEBUGLOG(6, "blockPos%6u: offset = %u", (unsigned)(op-(BYTE*)dst), (unsigned)offset); - match = op - offset; - assert(match <= op); /* overflow check */ - - /* get matchlength */ - length = token & ML_MASK; - DEBUGLOG(7, " match length token = %u (len==%u)", (unsigned)length, (unsigned)length+MINMATCH); - - if (length == ML_MASK) { - size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); - if (addl == rvl_error) { - DEBUGLOG(5, "error reading long match length"); - goto _output_error; - } - length += addl; - length += MINMATCH; - DEBUGLOG(7, " long match length == %u", (unsigned)length); - if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ - if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { - goto 
safe_match_copy; - } - } else { - length += MINMATCH; - if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { - DEBUGLOG(7, "moving to safe_match_copy (ml==%u)", (unsigned)length); - goto safe_match_copy; - } - - /* Fastpath check: skip LZ4_wildCopy32 when true */ - if ((dict == withPrefix64k) || (match >= lowPrefix)) { - if (offset >= 8) { - assert(match >= lowPrefix); - assert(match <= op); - assert(op + 18 <= oend); - - LZ4_memcpy(op, match, 8); - LZ4_memcpy(op+8, match+8, 8); - LZ4_memcpy(op+16, match+16, 2); - op += length; - continue; - } } } - - if ( checkOffset && (unlikely(match + dictSize < lowPrefix)) ) { - DEBUGLOG(5, "Error : pos=%zi, offset=%zi => outside buffers", op-lowPrefix, op-match); - goto _output_error; + /* copy literals */ + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((op + length > oend - 32) || (ip + length > iend - 32)) { + goto safe_literal_copy; } - /* match starting within external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) { - assert(dictEnd != NULL); - if (unlikely(op+length > oend-LASTLITERALS)) { - if (partialDecoding) { - DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); - length = MIN(length, (size_t)(oend-op)); - } else { - DEBUGLOG(6, "end-of-block condition violated") - goto _output_error; - } } - - if (length <= (size_t)(lowPrefix-match)) { - /* match fits entirely within external dictionary : just copy */ - LZ4_memmove(op, dictEnd - (lowPrefix-match), length); - op += length; - } else { - /* match stretches into both external dictionary and current block */ - size_t const copySize = (size_t)(lowPrefix - match); - size_t const restSize = length - copySize; - LZ4_memcpy(op, dictEnd - copySize, copySize); - op += copySize; - if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ - BYTE* const endOfMatch = op + restSize; - const BYTE* copyFrom = lowPrefix; - while (op < endOfMatch) { *op++ = *copyFrom++; } - } else { - LZ4_memcpy(op, lowPrefix, restSize); - op += restSize; - } } - continue; + LZ4_wildCopy32(op, ip, op + length); + ip += length; + op += length; + } else if (ip <= iend - (16 + 1 /*max lit + offset + nextToken*/)) { + /* We don't need to check oend, since we check it once for each loop + * below */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* Literals can only be <= 14, but hope compilers optimize better + * when copy by a register size */ + LZ4_memcpy(op, ip, 16); + ip += length; + op += length; + } else { + goto safe_literal_copy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + DEBUGLOG(6, "blockPos%6u: offset = %u", (unsigned)(op - (BYTE*)dst), + (unsigned)offset); + match = op - offset; + assert(match <= op); /* overflow check */ + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, " match length token = %u (len==%u)", (unsigned)length, + (unsigned)length + MINMATCH); + + if (length == ML_MASK) { + size_t const addl = + read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + DEBUGLOG(5, "error reading long match length"); + goto _output_error; + } + length += addl; + length += MINMATCH; + DEBUGLOG(7, " long match length == %u", (unsigned)length); + if (unlikely((uptrval)(op) + length < (uptrval)op)) { + goto _output_error; + } /* overflow detection */ + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(7, "moving to safe_match_copy (ml==%u)", + (unsigned)length); + 
goto safe_match_copy; } - /* copy match within block */ - cpy = op + length; + /* Fastpath check: skip LZ4_wildCopy32 when true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); + op += length; + continue; + } + } + } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { + DEBUGLOG(5, "Error : pos=%zi, offset=%zi => outside buffers", + op - lowPrefix, op - match); + goto _output_error; + } + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op + length > oend - LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG( + 7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend - op)); + } else { + DEBUGLOG(6, "end-of-block condition violated") + goto _output_error; + } + } - assert((op <= oend) && (oend-op >= 32)); - if (unlikely(offset<16)) { - LZ4_memcpy_using_offset(op, match, cpy, offset); + if (length <= (size_t)(lowPrefix - match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix - match), length); + op += length; } else { - LZ4_wildCopy32(op, match, cpy); + /* match stretches into both external dictionary and current + * block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { + *op++ = *copyFrom++; + } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } - - op = cpy; /* wildcopy correction */ - } - safe_decode: + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend - op >= 32)); + if (unlikely(offset < 16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: #endif - /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ - DEBUGLOG(6, "using safe decode loop"); - while (1) { - assert(ip < iend); - token = *ip++; - length = token >> ML_BITS; /* literal length */ - DEBUGLOG(7, "blockPos%6u: litLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); - - /* A two-stage shortcut for the most common case: - * 1) If the literal length is 0..14, and there is enough space, - * enter the shortcut and copy 16 bytes on behalf of the literals - * (in the fast mode, only 8 bytes can be safely copied this way). - * 2) Further if the match length is 4..18, copy 18 bytes in a similar - * manner; but we ensure that there's enough space in the output for - * those 18 bytes earlier, upon entering the shortcut (in other words, - * there is a combined check for both stages). - */ - if ( (length != RUN_MASK) - /* strictly "less than" on input, to re-enter the loop with at least one byte */ - && likely((ip < shortiend) & (op <= shortoend)) ) { - /* Copy the literals */ - LZ4_memcpy(op, ip, 16); - op += length; ip += length; - - /* The second stage: prepare for match copying, decode full info. - * If it doesn't work out, the info won't be wasted. 
*/ - length = token & ML_MASK; /* match length */ - DEBUGLOG(7, "blockPos%6u: matchLength token = %u (len=%u)", (unsigned)(op-(BYTE*)dst), (unsigned)length, (unsigned)length + 4); - offset = LZ4_readLE16(ip); ip += 2; - match = op - offset; - assert(match <= op); /* check overflow */ - - /* Do not deal with overlapping matches. */ - if ( (length != ML_MASK) - && (offset >= 8) - && (dict==withPrefix64k || match >= lowPrefix) ) { - /* Copy the match. */ - LZ4_memcpy(op + 0, match + 0, 8); - LZ4_memcpy(op + 8, match + 8, 8); - LZ4_memcpy(op +16, match +16, 2); - op += length + MINMATCH; - /* Both stages worked, load the next token. */ - continue; - } - - /* The second stage didn't work out, but the info is ready. - * Propel it right to the point of match copying. */ - goto _copy_match; + /* Main Loop : decode remaining sequences where output < + * FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using safe decode loop"); + while (1) { + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ((length != RUN_MASK) + /* strictly "less than" on input, to re-enter the loop with at + least one byte */ + && likely((ip < shortiend) & (op <= shortoend))) { + /* Copy the literals */ + LZ4_memcpy(op, ip, 16); + op += length; + ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + DEBUGLOG(7, "blockPos%6u: matchLength token = %u (len=%u)", + (unsigned)(op - (BYTE*)dst), (unsigned)length, + (unsigned)length + 4); + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && (offset >= 8) && + (dict == withPrefix64k || match >= lowPrefix)) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; } - /* decode literal length */ - if (length == RUN_MASK) { - size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); - if (addl == rvl_error) { goto _output_error; } - length += addl; - if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ - if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend - RUN_MASK, 1); + if (addl == rvl_error) { + goto _output_error; } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)(op))) { + goto _output_error; + } /* overflow detection */ + if (unlikely((uptrval)(ip) + length < (uptrval)(ip))) { + goto _output_error; + } /* overflow detection */ + } #if LZ4_FAST_DEC_LOOP - safe_literal_copy: + safe_literal_copy: #endif - /* copy literals */ - cpy = op+length; - - LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); - if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) { - /* We've either hit the input parsing restriction or the output parsing restriction. - * In the normal scenario, decoding a full block, it must be the last sequence, - * otherwise it's an error (invalid input or dimensions). - * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. - */ - if (partialDecoding) { - /* Since we are partial decoding we may be in this block because of the output parsing - * restriction, which is not valid since the output buffer is allowed to be undersized. - */ - DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") - DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); - DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); - DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); - /* Finishing in the middle of a literals segment, - * due to lack of input. - */ - if (ip+length > iend) { - length = (size_t)(iend-ip); - cpy = op + length; - } - /* Finishing in the middle of a literals segment, - * due to lack of output space. - */ - if (cpy > oend) { - cpy = oend; - assert(op<=oend); - length = (size_t)(oend-op); - } - } else { - /* We must be on the last sequence (or invalid) because of the parsing limitations - * so check that we exactly consume the input and don't overrun the output buffer. - */ - if ((ip+length != iend) || (cpy > oend)) { - DEBUGLOG(5, "should have been last run of literals") - DEBUGLOG(5, "ip(%p) + length(%i) = %p != iend (%p)", (void*)ip, (int)length, (void*)(ip+length), (void*)iend); - DEBUGLOG(5, "or cpy(%p) > (oend-MFLIMIT)(%p)", (void*)cpy, (void*)(oend-MFLIMIT)); - DEBUGLOG(5, "after writing %u bytes / %i bytes available", (unsigned)(op-(BYTE*)dst), outputSize); - goto _output_error; - } - } - LZ4_memmove(op, ip, length); /* supports overlapping memory regions, for in-place decompression scenarios */ - ip += length; - op += length; - /* Necessarily EOF when !partialDecoding. - * When partialDecoding, it is EOF if we've either - * filled the output buffer or - * can't proceed with reading an offset for following match. - */ - if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { - break; - } + /* copy literals */ + cpy = op + length; + + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy > oend - MFLIMIT) || + (ip + length > iend - (2 + 1 + LASTLITERALS))) { + /* We've either hit the input parsing restriction or the output + * parsing restriction. In the normal scenario, decoding a full + * block, it must be the last sequence, otherwise it's an error + * (invalid input or dimensions). In partialDecoding scenario, it's + * necessary to ensure there is no buffer overflow. 
+ */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because + * of the output parsing restriction, which is not valid since + * the output buffer is allowed to be undersized. + */ + DEBUGLOG(7, "partialDecoding: copying literals, close to input " + "or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", + (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", + (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", + (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip + length > iend) { + length = (size_t)(iend - ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op <= oend); + length = (size_t)(oend - op); + } } else { - LZ4_wildCopy8(op, ip, cpy); /* can overwrite up to 8 bytes beyond cpy */ - ip += length; op = cpy; + /* We must be on the last sequence (or invalid) because of the + * parsing limitations so check that we exactly consume the input + * and don't overrun the output buffer. + */ + if ((ip + length != iend) || (cpy > oend)) { + DEBUGLOG(5, "should have been last run of literals") + DEBUGLOG(5, "ip(%p) + length(%i) = %p != iend (%p)", + (void*)ip, (int)length, (void*)(ip + length), + (void*)iend); + DEBUGLOG(5, "or cpy(%p) > (oend-MFLIMIT)(%p)", (void*)cpy, + (void*)(oend - MFLIMIT)); + DEBUGLOG(5, "after writing %u bytes / %i bytes available", + (unsigned)(op - (BYTE*)dst), outputSize); + goto _output_error; + } } - - /* get offset */ - offset = LZ4_readLE16(ip); ip+=2; - match = op - offset; - - /* get matchlength */ - length = token & ML_MASK; - DEBUGLOG(7, "blockPos%6u: matchLength token = %u", (unsigned)(op-(BYTE*)dst), (unsigned)length); - - _copy_match: - if (length == ML_MASK) { - size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); - if (addl == rvl_error) { goto _output_error; } - length += addl; - if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + LZ4_memmove(op, ip, + length); /* supports overlapping memory regions, for + in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) { + break; } - length += MINMATCH; + } else { + LZ4_wildCopy8(op, ip, + cpy); /* can overwrite up to 8 bytes beyond cpy */ + ip += length; + op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, "blockPos%6u: matchLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + _copy_match: + if (length == ML_MASK) { + size_t const addl = + read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)op)) + goto _output_error; /* overflow detection */ + } + length += MINMATCH; #if LZ4_FAST_DEC_LOOP - safe_match_copy: + safe_match_copy: #endif - if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ - /* match starting within external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) { - assert(dictEnd != NULL); - if (unlikely(op+length > oend-LASTLITERALS)) { - if (partialDecoding) length = MIN(length, (size_t)(oend-op)); - else goto _output_error; /* doesn't respect parsing restriction */ - } - - if (length <= (size_t)(lowPrefix-match)) { - /* match fits entirely within external dictionary : just copy */ - LZ4_memmove(op, dictEnd - (lowPrefix-match), length); - op += length; - } else { - /* match stretches into both external dictionary and current block */ - size_t const copySize = (size_t)(lowPrefix - match); - size_t const restSize = length - copySize; - LZ4_memcpy(op, dictEnd - copySize, copySize); - op += copySize; - if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ - BYTE* const endOfMatch = op + restSize; - const BYTE* copyFrom = lowPrefix; - while (op < endOfMatch) *op++ = *copyFrom++; - } else { - LZ4_memcpy(op, lowPrefix, restSize); - op += restSize; - } } - continue; - } - assert(match >= lowPrefix); - - /* copy match within block */ - cpy = op + length; - - /* partialDecoding : may end anywhere within the block */ - assert(op<=oend); - if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { - size_t const mlen = MIN(length, (size_t)(oend-op)); - const BYTE* const matchEnd = match + mlen; - BYTE* const copyEnd = op + mlen; - if (matchEnd > op) { /* overlap copy */ - while (op < copyEnd) { *op++ = *match++; } - } else { - LZ4_memcpy(op, match, mlen); - } - op = copyEnd; - if (op == oend) { break; } - continue; + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) + goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op + length > oend - LASTLITERALS)) { + if (partialDecoding) + length = MIN(length, (size_t)(oend - op)); + else + goto _output_error; /* doesn't respect parsing restriction */ } - if (unlikely(offset<8)) { - LZ4_write32(op, 0); /* silence msan warning when offset==0 */ - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += inc32table[offset]; - LZ4_memcpy(op+4, match, 4); - match -= dec64table[offset]; + if (length <= (size_t)(lowPrefix - match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix - match), length); + op += length; } else { - LZ4_memcpy(op, match, 8); - match += 8; + /* match stretches into both external 
dictionary and current + * block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) + *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } - op += 8; - - if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { - BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); - if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ - if (op < oCopyLimit) { - LZ4_wildCopy8(op, match, oCopyLimit); - match += oCopyLimit - op; - op = oCopyLimit; - } - while (op < cpy) { *op++ = *match++; } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op <= oend); + if (partialDecoding && (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend - op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { + *op++ = *match++; + } } else { - LZ4_memcpy(op, match, 8); - if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { + break; + } + continue; + } + + if (unlikely(offset < 8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op + 4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + if (cpy > oend - LASTLITERALS) { + goto _output_error; + } /* Error : last LASTLITERALS bytes must be literals (uncompressed) + */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { + *op++ = *match++; } - op = cpy; /* wildcopy correction */ - } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { + LZ4_wildCopy8(op + 8, match + 8, cpy); + } + } + op = cpy; /* wildcopy correction */ + } - /* end of decoding */ - DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); - return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + /* end of decoding */ + DEBUGLOG(5, "decoded %i bytes", (int)(((char*)op) - dst)); + return (int)(((char*)op) - dst); /* Nb of output bytes decoded */ - /* Overflow error detected */ - _output_error: - return (int) (-(((const char*)ip)-src))-1; - } + /* Overflow error detected */ + _output_error: + return (int)(-(((const char*)ip) - src)) - 1; + } } - /*===== Instantiate the API decoding functions. 
=====*/ LZ4_FORCE_O2 -int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +int LZ4_decompress_safe(const char* source, + char* dest, + int compressedSize, + int maxDecompressedSize) { - return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, - decode_full_block, noDict, - (BYTE*)dest, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); } LZ4_FORCE_O2 -int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +int LZ4_decompress_safe_partial(const char* src, + char* dst, + int compressedSize, + int targetOutputSize, + int dstCapacity) { - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, - partial_decode, - noDict, (BYTE*)dst, NULL, 0); + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + partial_decode, noDict, (BYTE*)dst, NULL, 0); } LZ4_FORCE_O2 int LZ4_decompress_fast(const char* source, char* dest, int originalSize) { - DEBUGLOG(5, "LZ4_decompress_fast"); - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - 0, NULL, 0); + DEBUGLOG(5, "LZ4_decompress_fast"); + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 0, NULL, 0); } /*===== Instantiate a few more decoding cases, used more than once. =====*/ LZ4_FORCE_O2 /* Exported, an obsolete API function. */ -int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) + int + LZ4_decompress_safe_withPrefix64k(const char* source, + char* dest, + int compressedSize, + int maxOutputSize) { - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, withPrefix64k, - (BYTE*)dest - 64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); } LZ4_FORCE_O2 -static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity) +static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity) { - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, - partial_decode, withPrefix64k, - (BYTE*)dest - 64 KB, NULL, 0); + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); } /* Another obsolete API function, paired with the previous one. 
*/ -int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +int LZ4_decompress_fast_withPrefix64k(const char* source, + char* dest, + int originalSize) { - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - 64 KB, NULL, 0); + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 64 KB, NULL, 0); } LZ4_FORCE_O2 -static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, - size_t prefixSize) +static int LZ4_decompress_safe_withSmallPrefix(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize) { - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, noDict, - (BYTE*)dest-prefixSize, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, noDict, + (BYTE*)dest - prefixSize, NULL, 0); } LZ4_FORCE_O2 -static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, - size_t prefixSize) +static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + size_t prefixSize) { - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, - partial_decode, noDict, - (BYTE*)dest-prefixSize, NULL, 0); + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, + (BYTE*)dest - prefixSize, NULL, 0); } LZ4_FORCE_O2 -int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, - int compressedSize, int maxOutputSize, - const void* dictStart, size_t dictSize) +int LZ4_decompress_safe_forceExtDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const void* dictStart, + size_t dictSize) { - DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, usingExtDict, - (BYTE*)dest, (const BYTE*)dictStart, dictSize); + DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, (BYTE*)dest, + (const BYTE*)dictStart, dictSize); } LZ4_FORCE_O2 -int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, - int compressedSize, int targetOutputSize, int dstCapacity, - const void* dictStart, size_t dictSize) +int LZ4_decompress_safe_partial_forceExtDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const void* dictStart, + size_t dictSize) { - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, - partial_decode, usingExtDict, - (BYTE*)dest, (const BYTE*)dictStart, dictSize); + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, (BYTE*)dest, + (const BYTE*)dictStart, dictSize); } LZ4_FORCE_O2 -static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, - const void* dictStart, size_t dictSize) +static int LZ4_decompress_fast_extDict(const char* source, + char* dest, + int originalSize, + const void* 
dictStart, + size_t dictSize) { - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - 0, (const BYTE*)dictStart, dictSize); + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 0, (const BYTE*)dictStart, + dictSize); } /* The "double dictionary" mode, for use with e.g. ring buffers: the first part - * of the dictionary is passed as prefix, and the second via dictStart + dictSize. - * These routines are used only once, in LZ4_decompress_*_continue(). + * of the dictionary is passed as prefix, and the second via dictStart + + * dictSize. These routines are used only once, in LZ4_decompress_*_continue(). */ LZ4_FORCE_INLINE -int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, - size_t prefixSize, const void* dictStart, size_t dictSize) +int LZ4_decompress_safe_doubleDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize, + const void* dictStart, + size_t dictSize) { - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, usingExtDict, - (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); + return LZ4_decompress_generic( + source, dest, compressedSize, maxOutputSize, decode_full_block, + usingExtDict, (BYTE*)dest - prefixSize, (const BYTE*)dictStart, dictSize); } /*===== streaming decompression functions =====*/ @@ -2571,37 +3093,42 @@ int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compresse #if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) LZ4_streamDecode_t* LZ4_createStreamDecode(void) { - LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= sizeof(LZ4_streamDecode_t_internal)); - return (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= + sizeof(LZ4_streamDecode_t_internal)); + return (LZ4_streamDecode_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); } -int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +int LZ4_freeStreamDecode(LZ4_streamDecode_t* LZ4_stream) { - if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ - FREEMEM(LZ4_stream); - return 0; + if (LZ4_stream == NULL) { + return 0; + } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; } #endif /*! LZ4_setStreamDecode() : * Use this function to instruct where to find the dictionary. - * This function is not necessary if previous data is still available where it was decoded. - * Loading a size of 0 is allowed (same effect as no dictionary). + * This function is not necessary if previous data is still available where it + * was decoded. Loading a size of 0 is allowed (same effect as no dictionary). 
* @return : 1 if OK, 0 if error */ -int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) -{ - LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; - lz4sd->prefixSize = (size_t)dictSize; - if (dictSize) { - assert(dictionary != NULL); - lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; - } else { - lz4sd->prefixEnd = (const BYTE*) dictionary; - } - lz4sd->externalDict = NULL; - lz4sd->extDictSize = 0; - return 1; +int LZ4_setStreamDecode(LZ4_streamDecode_t* LZ4_streamDecode, + const char* dictionary, + int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*)dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*)dictionary; + } + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; } /*! LZ4_decoderRingBufferSize() : @@ -2617,101 +3144,118 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti */ int LZ4_decoderRingBufferSize(int maxBlockSize) { - if (maxBlockSize < 0) return 0; - if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; - if (maxBlockSize < 16) maxBlockSize = 16; - return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); + if (maxBlockSize < 0) + return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) + return 0; + if (maxBlockSize < 16) + maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); } /* *_continue() : - These decoding functions allow decompression of multiple blocks in "streaming" mode. - Previously decoded blocks must still be available at the memory position where they were decoded. - If it's not possible, save the relevant part of decoded data into a safe buffer, - and indicate where it stands using LZ4_setStreamDecode() + These decoding functions allow decompression of multiple blocks in +"streaming" mode. Previously decoded blocks must still be available at the +memory position where they were decoded. If it's not possible, save the relevant +part of decoded data into a safe buffer, and indicate where it stands using +LZ4_setStreamDecode() */ LZ4_FORCE_O2 -int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; - int result; - - if (lz4sd->prefixSize == 0) { - /* The first call, no dictionary yet. */ - assert(lz4sd->extDictSize == 0); - result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)result; - lz4sd->prefixEnd = (BYTE*)dest + result; - } else if (lz4sd->prefixEnd == (BYTE*)dest) { - /* They're rolling the current segment. */ - if (lz4sd->prefixSize >= 64 KB - 1) - result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); - else if (lz4sd->extDictSize == 0) - result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, - lz4sd->prefixSize); - else - result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize += (size_t)result; - lz4sd->prefixEnd += result; - } else { - /* The buffer wraps around, or they're switching to another buffer. 
*/ - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, - lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)result; - lz4sd->prefixEnd = (BYTE*)dest + result; - } - - return result; +int LZ4_decompress_safe_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, + char* dest, + int compressedSize, + int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k( + source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix( + source, dest, compressedSize, maxOutputSize, lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict( + source, dest, compressedSize, maxOutputSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. */ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict( + source, dest, compressedSize, maxOutputSize, lz4sd->externalDict, + lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; } LZ4_FORCE_O2 int -LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, - const char* source, char* dest, int originalSize) -{ - LZ4_streamDecode_t_internal* const lz4sd = - (assert(LZ4_streamDecode!=NULL), &LZ4_streamDecode->internal_donotuse); - int result; - - DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); - assert(originalSize >= 0); - - if (lz4sd->prefixSize == 0) { - DEBUGLOG(5, "first invocation : no prefix nor extDict"); - assert(lz4sd->extDictSize == 0); - result = LZ4_decompress_fast(source, dest, originalSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)originalSize; - lz4sd->prefixEnd = (BYTE*)dest + originalSize; - } else if (lz4sd->prefixEnd == (BYTE*)dest) { - DEBUGLOG(5, "continue using existing prefix"); - result = LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - lz4sd->prefixSize, - lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize += (size_t)originalSize; - lz4sd->prefixEnd += originalSize; - } else { - DEBUGLOG(5, "prefix becomes extDict"); - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_fast_extDict(source, dest, originalSize, - lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)originalSize; - lz4sd->prefixEnd = (BYTE*)dest + originalSize; - } - - return result; 
+LZ4_decompress_fast_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, + char* dest, + int originalSize) +{ + LZ4_streamDecode_t_internal* const lz4sd = + (assert(LZ4_streamDecode != NULL), &LZ4_streamDecode->internal_donotuse); + int result; + + DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + DEBUGLOG(5, "continue using existing prefix"); + result = LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + DEBUGLOG(5, "prefix becomes extDict"); + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict( + source, dest, originalSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; } - /* Advanced decoding functions : *_usingDict() : @@ -2719,74 +3263,112 @@ Advanced decoding functions : the dictionary must be explicitly provided within parameters */ -int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) -{ - if (dictSize==0) - return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); - if (dictStart+dictSize == dest) { - if (dictSize >= 64 KB - 1) { - return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); -} - -int LZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize) -{ - if (dictSize==0) - return LZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity); - if (dictStart+dictSize == dest) { - if (dictSize >= 64 KB - 1) { - return LZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize); -} - -int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) -{ - if (dictSize==0 || dictStart+dictSize == dest) - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - (size_t)dictSize, NULL, 0); - assert(dictSize >= 0); - return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, 
(size_t)dictSize); +int LZ4_decompress_safe_usingDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart + dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, + maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix( + source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict( + source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_safe_partial_usingDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0) + return LZ4_decompress_safe_partial(source, dest, compressedSize, + targetOutputSize, dstCapacity); + if (dictStart + dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_partial_withPrefix64k( + source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_withSmallPrefix( + source, dest, compressedSize, targetOutputSize, dstCapacity, + (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_forceExtDict( + source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, + (size_t)dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, + char* dest, + int originalSize, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0 || dictStart + dictSize == dest) + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, (size_t)dictSize, NULL, + 0); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, + (size_t)dictSize); } - /*=************************************************* -* Obsolete Functions -***************************************************/ + * Obsolete Functions + ***************************************************/ /* obsolete compression functions */ -int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +int LZ4_compress_limitedOutput(const char* source, + char* dest, + int inputSize, + int maxOutputSize) { - return LZ4_compress_default(source, dest, inputSize, maxOutputSize); + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } int LZ4_compress(const char* src, char* dest, int srcSize) { - return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); } -int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +int LZ4_compress_limitedOutput_withState( + void* state, const char* src, char* dst, int srcSize, int dstSize) { - return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } -int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +int LZ4_compress_withState(void* state, const char* src, char* dst, int srcSize) { - return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); + return LZ4_compress_fast_extState(state, src, dst, srcSize, + LZ4_compressBound(srcSize), 1); } -int 
LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +int LZ4_compress_limitedOutput_continue(LZ4_stream_t* LZ4_stream, + const char* src, + char* dst, + int srcSize, + int dstCapacity) { - return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, + 1); } -int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +int LZ4_compress_continue(LZ4_stream_t* LZ4_stream, + const char* source, + char* dest, + int inputSize) { - return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, + LZ4_compressBound(inputSize), 1); } /* @@ -2795,13 +3377,16 @@ They are only provided here for compatibility with older user programs. - LZ4_uncompress is totally equivalent to LZ4_decompress_fast - LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe */ -int LZ4_uncompress (const char* source, char* dest, int outputSize) +int LZ4_uncompress(const char* source, char* dest, int outputSize) { - return LZ4_decompress_fast(source, dest, outputSize); + return LZ4_decompress_fast(source, dest, outputSize); } -int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +int LZ4_uncompress_unknownOutputSize(const char* source, + char* dest, + int isize, + int maxOutputSize) { - return LZ4_decompress_safe(source, dest, isize, maxOutputSize); + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } /* Obsolete Streaming functions */ @@ -2810,23 +3395,23 @@ int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } int LZ4_resetStreamState(void* state, char* inputBuffer) { - (void)inputBuffer; - LZ4_resetStream((LZ4_stream_t*)state); - return 0; + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; } #if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -void* LZ4_create (char* inputBuffer) +void* LZ4_create(char* inputBuffer) { - (void)inputBuffer; - return LZ4_createStream(); + (void)inputBuffer; + return LZ4_createStream(); } #endif -char* LZ4_slideInputBuffer (void* state) +char* LZ4_slideInputBuffer(void* state) { - /* avoid const char * -> char * conversion warning */ - return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; + /* avoid const char * -> char * conversion warning */ + return (char*)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; } -#endif /* LZ4_COMMONDEFS_ONLY */ +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/tracegrind/lz4.h b/tracegrind/lz4.h index 7f2a89d40..a08439161 100644 --- a/tracegrind/lz4.h +++ b/tracegrind/lz4.h @@ -32,7 +32,7 @@ - LZ4 homepage : http://www.lz4.org - LZ4 source repository : https://github.com/lz4/lz4 */ -#if defined (__cplusplus) +#if defined(__cplusplus) extern "C" { #endif @@ -41,20 +41,19 @@ extern "C" { /* --- Dependency --- */ #if !LZ4_FREESTANDING -#include /* size_t */ +#include /* size_t */ #endif - /** Introduction - LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core, - scalable with multi-cores CPU. It features an extremely fast decoder, with speed in - multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + LZ4 is lossless compression algorithm, providing compression speed >500 MB/s + per core, scalable with multi-cores CPU. 
It features an extremely fast + decoder, with speed in multiple GB/s per core, typically reaching RAM speed + limits on multi-core systems. - The LZ4 compression library provides in-memory compression and decompression functions. - It gives full buffer control to user. - Compression can be done in: + The LZ4 compression library provides in-memory compression and decompression + functions. It gives full buffer control to user. Compression can be done in: - a single step (described as Simple Functions) - a single step, reusing a context (described in Advanced Functions) - unbounded multiple steps (described as Streaming compression) @@ -63,40 +62,45 @@ extern "C" { Decompressing such a compressed block requires additional metadata. Exact metadata depends on exact decompression function. For the typical case of LZ4_decompress_safe(), - metadata includes block's compressed size, and maximum bound of decompressed size. - Each application is free to encode and pass such metadata in whichever way it wants. + metadata includes block's compressed size, and maximum bound of decompressed + size. Each application is free to encode and pass such metadata in whichever + way it wants. lz4.h only handle blocks, it can not generate Frames. Blocks are different from Frames (doc/lz4_Frame_format.md). Frames bundle both blocks and metadata in a specified manner. - Embedding metadata is required for compressed data to be self-contained and portable. - Frame format is delivered through a companion API, declared in lz4frame.h. - The `lz4` CLI can only manage frames. + Embedding metadata is required for compressed data to be self-contained and + portable. Frame format is delivered through a companion API, declared in + lz4frame.h. The `lz4` CLI can only manage frames. */ /*^*************************************************************** -* Export parameters -*****************************************************************/ + * Export parameters + *****************************************************************/ /* -* LZ4_DLL_EXPORT : -* Enable exporting of functions when building a Windows DLL -* LZ4LIB_VISIBILITY : -* Control library symbols visibility. -*/ + * LZ4_DLL_EXPORT : + * Enable exporting of functions when building a Windows DLL + * LZ4LIB_VISIBILITY : + * Control library symbols visibility. + */ #ifndef LZ4LIB_VISIBILITY -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) -# else -# define LZ4LIB_VISIBILITY -# endif +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4LIB_VISIBILITY __attribute__((visibility("default"))) +#else +#define LZ4LIB_VISIBILITY +#endif #endif -#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) -# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY -#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) -# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT == 1) +#define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT == 1) +#define LZ4LIB_API \ + __declspec(dllimport) \ + LZ4LIB_VISIBILITY /* It isn't required but allows to generate better \ + code, saving a function pointer load from the IAT \ + and an indirect jump.*/ #else -# define LZ4LIB_API LZ4LIB_VISIBILITY +#define LZ4LIB_API LZ4LIB_VISIBILITY #endif /*! 
LZ4_FREESTANDING : @@ -112,155 +116,184 @@ extern "C" { * - See tests/freestanding.c to check its basic setup. */ #if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) -# define LZ4_HEAPMODE 0 -# define LZ4HC_HEAPMODE 0 -# define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 -# if !defined(LZ4_memcpy) -# error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." -# endif -# if !defined(LZ4_memset) -# error "LZ4_FREESTANDING requires macro 'LZ4_memset'." -# endif -# if !defined(LZ4_memmove) -# error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." -# endif -#elif ! defined(LZ4_FREESTANDING) -# define LZ4_FREESTANDING 0 +#define LZ4_HEAPMODE 0 +#define LZ4HC_HEAPMODE 0 +#define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +#if !defined(LZ4_memcpy) +#error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." +#endif +#if !defined(LZ4_memset) +#error "LZ4_FREESTANDING requires macro 'LZ4_memset'." +#endif +#if !defined(LZ4_memmove) +#error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." +#endif +#elif !defined(LZ4_FREESTANDING) +#define LZ4_FREESTANDING 0 #endif - /*------ Version ------*/ -#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ -#define LZ4_VERSION_MINOR 10 /* for new (non-breaking) interface capabilities */ -#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ - -#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) - -#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE -#define LZ4_QUOTE(str) #str +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR \ + 10 /* for new (non-breaking) interface capabilities \ + */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER \ + (LZ4_VERSION_MAJOR * 100 * 100 + LZ4_VERSION_MINOR * 100 + \ + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str #define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) -#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ - -LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version; requires v1.3.0+ */ -LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version; requires v1.7.5+ */ +#define LZ4_VERSION_STRING \ + LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ +LZ4LIB_API int +LZ4_versionNumber(void); /**< library version number; useful to check dll + version; requires v1.3.0+ */ +LZ4LIB_API const char* +LZ4_versionString(void); /**< library version string; useful to check dll + version; requires v1.7.5+ */ /*-************************************ -* Tuning memory usage -**************************************/ + * Tuning memory usage + **************************************/ /*! * LZ4_MEMORY_USAGE : * Can be selected at compile time, by setting LZ4_MEMORY_USAGE. - * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB) - * Increasing memory usage improves compression ratio, generally at the cost of speed. - * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality. - * Default value is 14, for 16KB, which nicely fits into most L1 caches. + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> + * 64KB; 20 -> 1MB) Increasing memory usage improves compression ratio, + * generally at the cost of speed. 
Reduced memory usage may improve speed at the + * cost of ratio, thanks to better cache locality. Default value is 14, for + * 16KB, which nicely fits into most L1 caches. */ #ifndef LZ4_MEMORY_USAGE -# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT +#define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT #endif /* These are absolute limits, they should not be changed by users */ -#define LZ4_MEMORY_USAGE_MIN 10 +#define LZ4_MEMORY_USAGE_MIN 10 #define LZ4_MEMORY_USAGE_DEFAULT 14 -#define LZ4_MEMORY_USAGE_MAX 20 +#define LZ4_MEMORY_USAGE_MAX 20 #if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) -# error "LZ4_MEMORY_USAGE is too small !" +#error "LZ4_MEMORY_USAGE is too small !" #endif #if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) -# error "LZ4_MEMORY_USAGE is too large !" +#error "LZ4_MEMORY_USAGE is too large !" #endif /*-************************************ -* Simple Functions -**************************************/ + * Simple Functions + **************************************/ /*! LZ4_compress_default() : * Compresses 'srcSize' bytes from buffer 'src' * into already allocated 'dst' buffer of size 'dstCapacity'. - * Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). - * It also runs faster, so it's a recommended setting. - * If the function cannot compress 'src' into a more limited 'dst' budget, - * compression stops *immediately*, and the function result is zero. - * In which case, 'dst' content is undefined (invalid). - * srcSize : max supported value is LZ4_MAX_INPUT_SIZE. - * dstCapacity : size of buffer 'dst' (which must be already allocated) - * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) - * or 0 if compression fails - * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). + * Compression is guaranteed to succeed if 'dstCapacity' >= + * LZ4_compressBound(srcSize). It also runs faster, so it's a recommended + * setting. If the function cannot compress 'src' into a more limited 'dst' + * budget, compression stops *immediately*, and the function result is zero. In + * which case, 'dst' content is undefined (invalid). srcSize : max supported + * value is LZ4_MAX_INPUT_SIZE. dstCapacity : size of buffer 'dst' (which must + * be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= + * dstCapacity) or 0 if compression fails Note : This function is protected + * against buffer overflow scenarios (never writes outside 'dst' buffer, nor + * read outside 'source' buffer). */ -LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); +LZ4LIB_API int +LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); /*! LZ4_decompress_safe() : * @compressedSize : is the exact complete size of the compressed block. - * @dstCapacity : is the size of destination buffer (which must be already allocated), - * presumed an upper bound of decompressed size. - * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) - * If destination buffer is not large enough, decoding will stop and output an error code (negative value). - * If the source stream is detected malformed, the function will stop decoding and return a negative result. 
- * Note 1 : This function is protected against malicious data packets : - * it will never writes outside 'dst' buffer, nor read outside 'source' buffer, - * even if the compressed block is maliciously modified to order the decoder to do these actions. - * In such case, the decoder stops immediately, and considers the compressed block malformed. - * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them. - * The implementation is free to send / store / derive this information in whichever way is most beneficial. - * If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead. + * @dstCapacity : is the size of destination buffer (which must be already + * allocated), presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer + * (necessarily <= dstCapacity) If destination buffer is not large enough, + * decoding will stop and output an error code (negative value). If the source + * stream is detected malformed, the function will stop decoding and return a + * negative result. Note 1 : This function is protected against malicious data + * packets : it will never writes outside 'dst' buffer, nor read outside + * 'source' buffer, even if the compressed block is maliciously modified to + * order the decoder to do these actions. In such case, the decoder stops + * immediately, and considers the compressed block malformed. Note 2 : + * compressedSize and dstCapacity must be provided to the function, the + * compressed block does not contain them. The implementation is free to send / + * store / derive this information in whichever way is most beneficial. If there + * is a need for a different format which bundles together both compressed data + * and its metadata, consider looking at lz4frame.h instead. */ -LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); - +LZ4LIB_API int LZ4_decompress_safe(const char* src, + char* dst, + int compressedSize, + int dstCapacity); /*-************************************ -* Advanced Functions -**************************************/ -#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ -#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + * Advanced Functions + **************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) \ + ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE \ + ? 0 \ + : (isize) + ((isize) / 255) + 16) /*! LZ4_compressBound() : - Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) - This function is primarily useful for memory allocation purposes (destination buffer size). - Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). - Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) - inputSize : max supported value is LZ4_MAX_INPUT_SIZE - return : maximum output size in a "worst case" scenario - or 0, if input size is incorrect (too large or negative) + Provides the maximum size that LZ4 compression may output in a "worst case" + scenario (input data not compressible) This function is primarily useful for + memory allocation purposes (destination buffer size). 
Macro + LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack + memory allocation for example). Note that LZ4_compress_default() compresses + faster when dstCapacity is >= LZ4_compressBound(srcSize) inputSize : max + supported value is LZ4_MAX_INPUT_SIZE return : maximum output size in a + "worst case" scenario or 0, if input size is incorrect (too large or + negative) */ LZ4LIB_API int LZ4_compressBound(int inputSize); /*! LZ4_compress_fast() : - Same as LZ4_compress_default(), but allows selection of "acceleration" factor. - The larger the acceleration value, the faster the algorithm, but also the lesser the compression. - It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. - An acceleration value of "1" is the same as regular LZ4_compress_default() - Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). - Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c). + Same as LZ4_compress_default(), but allows selection of "acceleration" + factor. The larger the acceleration value, the faster the algorithm, but also + the lesser the compression. It's a trade-off. It can be fine tuned, with each + successive value providing roughly +~3% to speed. An acceleration value of + "1" is the same as regular LZ4_compress_default() Values <= 0 will be + replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). Values > + LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == + 65537, see lz4.c). */ -LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); - +LZ4LIB_API int LZ4_compress_fast( + const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); /*! LZ4_compress_fast_extState() : - * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. - * Use LZ4_sizeofState() to know how much memory must be allocated, + * Same as LZ4_compress_fast(), using an externally allocated memory space for + * its state. Use LZ4_sizeofState() to know how much memory must be allocated, * and allocate it on 8-bytes boundaries (using `malloc()` typically). * Then, provide this buffer as `void* state` to compression function. */ LZ4LIB_API int LZ4_sizeofState(void); -LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); +LZ4LIB_API int LZ4_compress_fast_extState(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); /*! LZ4_compress_destSize() : * Reverse the logic : compresses as much data as possible from 'src' buffer * into already allocated buffer 'dst', of size >= 'dstCapacity'. - * This function either compresses the entire 'src' content into 'dst' if it's large enough, - * or fill 'dst' buffer completely with as much data as possible from 'src'. - * note: acceleration parameter is fixed to "default". + * This function either compresses the entire 'src' content into 'dst' if it's + * large enough, or fill 'dst' buffer completely with as much data as possible + * from 'src'. note: acceleration parameter is fixed to "default". * * *srcSizePtr : in+out parameter. Initially contains size of input. - * Will be modified to indicate how many bytes where read from 'src' to fill 'dst'. - * New value is necessarily <= input value. + * Will be modified to indicate how many bytes where read from + * 'src' to fill 'dst'. 
New value is necessarily <= input value. * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity) * or 0 if compression fails. * - * Note : 'targetDstSize' must be >= 1, because it's the smallest valid lz4 payload. + * Note : 'targetDstSize' must be >= 1, because it's the smallest valid lz4 + * payload. * * Note 2:from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): * the produced compressed content could, in rare circumstances, @@ -273,7 +306,10 @@ LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* d * a dstCapacity which is > decompressedSize, by at least 1 byte. * See https://github.com/lz4/lz4/issues/859 for details */ -LZ4LIB_API int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize); +LZ4LIB_API int LZ4_compress_destSize(const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize); /*! LZ4_decompress_safe_partial() : * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', @@ -283,22 +319,24 @@ LZ4LIB_API int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr * This can be useful to boost performance * whenever only the beginning of a block is required. * - * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize) - * If source stream is detected malformed, function returns a negative result. + * @return : the number of bytes decoded in `dst` (necessarily <= + * targetOutputSize) If source stream is detected malformed, function returns a + * negative result. * - * Note 1 : @return can be < targetOutputSize, if compressed block contains less data. + * Note 1 : @return can be < targetOutputSize, if compressed block contains + * less data. * * Note 2 : targetOutputSize must be <= dstCapacity * - * Note 3 : this function effectively stops decoding on reaching targetOutputSize, - * so dstCapacity is kind of redundant. - * This is because in older versions of this function, - * decoding operation would still write complete sequences. - * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, - * it could write more bytes, though only up to dstCapacity. - * Some "margin" used to be required for this operation to work properly. - * Thankfully, this is no longer necessary. - * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. + * Note 3 : this function effectively stops decoding on reaching + * targetOutputSize, so dstCapacity is kind of redundant. This is because in + * older versions of this function, decoding operation would still write + * complete sequences. Therefore, there was no guarantee that it would stop + * writing at exactly targetOutputSize, it could write more bytes, though only + * up to dstCapacity. Some "margin" used to be required for this operation to + * work properly. Thankfully, this is no longer necessary. The function + * nonetheless keeps the same signature, in an effort to preserve API + * compatibility. * * Note 4 : If srcSize is the exact size of the block, * then targetOutputSize can be any value, @@ -309,18 +347,22 @@ LZ4LIB_API int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr * then targetOutputSize **MUST** be <= block's decompressed size. * Otherwise, *silent corruption will occur*. 
*/ -LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); - +LZ4LIB_API int LZ4_decompress_safe_partial(const char* src, + char* dst, + int srcSize, + int targetOutputSize, + int dstCapacity); /*-********************************************* -* Streaming Compression Functions -***********************************************/ -typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + * Streaming Compression Functions + ***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ /*! Note about RC_INVOKED - - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio). + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is + part of MSVC/Visual Studio). https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) @@ -330,10 +372,12 @@ typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ "#if !defined(RC_INVOKED) ... #endif" block that means "skip this block when rc.exe is trying to read it". */ -#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined( \ + RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros \ + */ #if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); -LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); +LZ4LIB_API int LZ4_freeStream(LZ4_stream_t* streamPtr); #endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ #endif @@ -357,31 +401,37 @@ LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); * Note: it's only useful to call LZ4_resetStream_fast() * in the context of streaming compression. * The *extState* functions perform their own resets. - * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + * Invoking LZ4_resetStream_fast() before is redundant, and even + * counterproductive. */ -LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); +LZ4LIB_API void LZ4_resetStream_fast(LZ4_stream_t* streamPtr); /*! LZ4_loadDict() : * Use this function to reference a static dictionary into LZ4_stream_t. * The dictionary must remain available during compression. * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. - * The same dictionary will have to be loaded on decompression side for successful decoding. - * Dictionary are useful for better compression of small data (KB range). - * While LZ4 itself accepts any input as dictionary, dictionary efficiency is also a topic. - * When in doubt, employ the Zstandard's Dictionary Builder. - * Loading a size of 0 is allowed, and is the same as reset. - * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded) + * The same dictionary will have to be loaded on decompression side for + * successful decoding. Dictionary are useful for better compression of small + * data (KB range). While LZ4 itself accepts any input as dictionary, dictionary + * efficiency is also a topic. When in doubt, employ the Zstandard's Dictionary + * Builder. Loading a size of 0 is allowed, and is the same as reset. 
+ * @return : loaded dictionary size, in bytes (note: only the last 64 KB are + * loaded) */ -LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); +LZ4LIB_API int +LZ4_loadDict(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); /*! LZ4_loadDictSlow() : v1.10.0+ * Same as LZ4_loadDict(), * but uses a bit more cpu to reference the dictionary content more thoroughly. * This is expected to slightly improve compression ratio. - * The extra-cpu cost is likely worth it if the dictionary is re-used across multiple sessions. - * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded) + * The extra-cpu cost is likely worth it if the dictionary is re-used across + * multiple sessions. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are + * loaded) */ -LZ4LIB_API int LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); +LZ4LIB_API int +LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); /*! LZ4_attach_dictionary() : stable since v1.10.0 * @@ -406,150 +456,180 @@ LZ4LIB_API int LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, * * The dictionary will only remain attached to the working stream through the * first compression call, at the end of which it is cleared. - * @dictionaryStream stream (and source buffer) must remain in-place / accessible / unchanged - * through the completion of the compression session. + * @dictionaryStream stream (and source buffer) must remain in-place / + * accessible / unchanged through the completion of the compression session. * * Note: there is no equivalent LZ4_attach_*() method on the decompression side - * because there is no initialization cost, hence no need to share the cost across multiple sessions. - * To decompress LZ4 blocks using dictionary, attached or not, - * just employ the regular LZ4_setStreamDecode() for streaming, - * or the stateless LZ4_decompress_safe_usingDict() for one-shot decompression. + * because there is no initialization cost, hence no need to share the cost + * across multiple sessions. To decompress LZ4 blocks using dictionary, attached + * or not, just employ the regular LZ4_setStreamDecode() for streaming, or the + * stateless LZ4_decompress_safe_usingDict() for one-shot decompression. */ -LZ4LIB_API void -LZ4_attach_dictionary(LZ4_stream_t* workingStream, - const LZ4_stream_t* dictionaryStream); +LZ4LIB_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream); /*! LZ4_compress_fast_continue() : - * Compress 'src' content using data from previously compressed blocks, for better compression ratio. - * 'dst' buffer must be already allocated. - * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * Compress 'src' content using data from previously compressed blocks, for + * better compression ratio. 'dst' buffer must be already allocated. If + * dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to + * succeed, and runs faster. * * @return : size of compressed block * or 0 if there is an error (typically, cannot fit into 'dst'). * - * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. - * Each block has precise boundaries. - * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. 
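A small sketch of the dictionary workflow described above, assuming a resident dictionary buffer; names are invented and this is not part of the patch:

    #include "lz4.h"

    /* Compress one block against a static dictionary (illustrative).
       The decoder must load the same dictionary to decode the block. */
    static int compress_with_dict(const char* dict, int dictSize,
                                  const char* src, int srcSize,
                                  char* dst, int dstCapacity)
    {
        LZ4_stream_t* s = LZ4_createStream();
        if (s == NULL)
            return -1;
        LZ4_loadDict(s, dict, dictSize); /* dict must stay valid during compression */
        int n = LZ4_compress_fast_continue(s, src, dst, srcSize, dstCapacity, 1);
        LZ4_freeStream(s);
        return n;                        /* 0 means the output did not fit */
    }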
- * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new + * block. Each block has precise boundaries. Each block must be decompressed + * separately, calling LZ4_decompress_*() with relevant metadata. It's not + * possible to append blocks together and expect a single invocation of + * LZ4_decompress_*() to decompress them together. * - * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, + * unmodified, at same address in memory ! * - * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. - * Make sure that buffers are separated, by at least one byte. - * This construction ensures that each block only depends on previous block. + * Note 3 : When input is structured as a double-buffer, each buffer can have + * any size, including < 64 KB. Make sure that buffers are separated, by at + * least one byte. This construction ensures that each block only depends on + * previous block. * - * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < + * 64 KB. * - * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + * Note 5 : After an error, the stream status is undefined (invalid), it can + * only be reset or freed. */ -LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); +LZ4LIB_API int LZ4_compress_fast_continue(LZ4_stream_t* streamPtr, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); /*! LZ4_saveDict() : - * If last 64KB data cannot be guaranteed to remain available at its current memory location, - * save it into a safer place (char* safeBuffer). - * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), - * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. - * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. + * If last 64KB data cannot be guaranteed to remain available at its current + * memory location, save it into a safer place (char* safeBuffer). This is + * schematically equivalent to a memcpy() followed by LZ4_loadDict(), but is + * much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 + * if error. */ -LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); - +LZ4LIB_API int +LZ4_saveDict(LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); /*-********************************************** -* Streaming Decompression Functions -* Bufferless synchronous API -************************************************/ -typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + * Streaming Decompression Functions + * Bufferless synchronous API + ************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ /*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : * creation / destruction of streaming decompression tracking context. 
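The double-buffer scheme mentioned in Note 3 above is easiest to see in code; a rough sketch, assuming hypothetical read_block()/write_block() I/O helpers that are not part of the patch:

    #include "lz4.h"

    #define BLOCK_SIZE (64 * 1024)

    /* Hypothetical I/O helpers, not part of lz4 or of this patch. */
    extern int  read_block(char* buf, int cap);  /* returns bytes read, 0 at EOF */
    extern void write_block(const char* buf, int size);

    static void stream_compress(void)
    {
        static char         inBuf[2][BLOCK_SIZE]; /* double buffer */
        static char         outBuf[LZ4_COMPRESSBOUND(BLOCK_SIZE)];
        static LZ4_stream_t stream;
        int                 idx = 0;

        LZ4_initStream(&stream, sizeof(stream));
        for (;;) {
            int inSize = read_block(inBuf[idx], BLOCK_SIZE);
            if (inSize == 0)
                break;
            int outSize = LZ4_compress_fast_continue(&stream, inBuf[idx], outBuf,
                                                     inSize, (int)sizeof(outBuf), 1);
            if (outSize <= 0)
                break;        /* stream state is invalid after an error (Note 5) */
            write_block(outBuf, outSize);
            idx ^= 1;         /* the previous block stays untouched in the other buffer */
        }
    }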
* A tracking context can be re-used multiple times. */ -#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined( \ + RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros \ + */ #if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); -LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); +LZ4LIB_API int LZ4_freeStreamDecode(LZ4_streamDecode_t* LZ4_stream); #endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ #endif /*! LZ4_setStreamDecode() : - * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. - * Use this function to start decompression of a new stream of blocks. - * A dictionary can optionally be set. Use NULL or size 0 for a reset order. - * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * An LZ4_streamDecode_t context can be allocated once and re-used multiple + * times. Use this function to start decompression of a new stream of blocks. A + * dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified + * during next decompression. * @return : 1 if OK, 0 if error */ -LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); +LZ4LIB_API int LZ4_setStreamDecode(LZ4_streamDecode_t* LZ4_streamDecode, + const char* dictionary, + int dictSize); /*! LZ4_decoderRingBufferSize() : v1.8.2+ * Note : in a ring buffer scenario (optional), * blocks are presumed decompressed next to each other - * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), - * at which stage it resumes from beginning of ring buffer. - * When setting such a ring buffer for streaming decompression, + * up to the moment there is not enough remaining space for next block + * (remainingSize < maxBlockSize), at which stage it resumes from beginning of + * ring buffer. When setting such a ring buffer for streaming decompression, * provides the minimum size of this ring buffer * to be compatible with any source respecting maxBlockSize condition. * @return : minimum ring buffer size, * or 0 if there is an error (invalid maxBlockSize). */ LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); -#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) \ + (65536 + 14 + \ + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ /*! LZ4_decompress_safe_continue() : - * This decoding function allows decompression of consecutive blocks in "streaming" mode. - * The difference with the usual independent blocks is that + * This decoding function allows decompression of consecutive blocks in + * "streaming" mode. The difference with the usual independent blocks is that * new blocks are allowed to find references into former blocks. - * A block is an unsplittable entity, and must be presented entirely to the decompression function. - * LZ4_decompress_safe_continue() only accepts one block at a time. - * It's modeled after `LZ4_decompress_safe()` and behaves similarly. + * A block is an unsplittable entity, and must be presented entirely to the + * decompression function. 
LZ4_decompress_safe_continue() only accepts one block + * at a time. It's modeled after `LZ4_decompress_safe()` and behaves similarly. * - * @LZ4_streamDecode : decompression state, tracking the position in memory of past data + * @LZ4_streamDecode : decompression state, tracking the position in memory of + * past data * @compressedSize : exact complete size of one compressed block. * @dstCapacity : size of destination buffer (which must be already allocated), * must be an upper bound of decompressed size. - * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity) - * If destination buffer is not large enough, decoding will stop and output an error code (negative value). - * If the source stream is detected malformed, the function will stop decoding and return a negative result. - * - * The last 64KB of previously decoded data *must* remain available and unmodified - * at the memory position where they were previously decoded. - * If less than 64KB of data has been decoded, all the data must be present. - * - * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : - * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). - * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. - * In which case, encoding and decoding buffers do not need to be synchronized. - * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * @return : number of bytes decompressed into destination buffer (necessarily + * <= dstCapacity) If destination buffer is not large enough, decoding will stop + * and output an error code (negative value). If the source stream is detected + * malformed, the function will stop decoding and return a negative result. + * + * The last 64KB of previously decoded data *must* remain available and + * unmodified at the memory position where they were previously decoded. If less + * than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of + * the following conditions : + * - Decompression buffer size is _at least_ + * LZ4_decoderRingBufferSize(maxBlockSize). maxBlockSize is the maximum size of + * any single block. It can have any value > 16 bytes. In which case, encoding + * and decoding buffers do not need to be synchronized. Actually, data can be + * produced by any source compliant with LZ4 format specification, and + * respecting maxBlockSize. * - Synchronized mode : - * Decompression buffer size is _exactly_ the same as compression buffer size, - * and follows exactly same update rule (block boundaries at same positions), - * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), - * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). - * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. - * In which case, encoding and decoding buffers do not need to be synchronized, - * and encoding ring buffer can have any size, including small ones ( < 64 KB). 
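Putting the streaming-decode pieces above together, a rough ring-buffer decode loop (illustrative; next_block() is a hypothetical input source and MAX_BLOCK an arbitrary bound, neither part of the patch):

    #include "lz4.h"

    #define MAX_BLOCK 4096

    extern int next_block(char* cmp, int cap); /* hypothetical: one compressed block, 0 at end */

    static void stream_decompress(void)
    {
        static char        ring[LZ4_DECODER_RING_BUFFER_SIZE(MAX_BLOCK)];
        static char        cmp[LZ4_COMPRESSBOUND(MAX_BLOCK)];
        LZ4_streamDecode_t ctx;
        int                offset = 0;

        LZ4_setStreamDecode(&ctx, NULL, 0); /* fresh stream, no dictionary */
        for (;;) {
            int cmpSize = next_block(cmp, (int)sizeof(cmp));
            if (cmpSize <= 0)
                break;
            int decSize = LZ4_decompress_safe_continue(&ctx, cmp, ring + offset,
                                                       cmpSize, MAX_BLOCK);
            if (decSize < 0)
                break;                      /* malformed block */
            /* consume decSize bytes at ring + offset here */
            offset += decSize;
            if (offset + MAX_BLOCK > (int)sizeof(ring))
                offset = 0;                 /* wrap, as permitted by the sizing rule above */
        }
    }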
+ * Decompression buffer size is _exactly_ the same as compression buffer + * size, and follows exactly same update rule (block boundaries at same + * positions), and decoding function is provided with exact decompressed size of + * each block (exception for last block of the stream), _then_ decoding & + * encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of + * maxBlockSize more bytes. In which case, encoding and decoding buffers do not + * need to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). * * Whenever these conditions are not possible, - * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, - * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. -*/ + * save the last 64KB of decoded data into a safe buffer where it can't be + * modified during decompression, then indicate where this data is saved using + * LZ4_setStreamDecode(), before decompressing next block. + */ LZ4LIB_API int -LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, - const char* src, char* dst, - int srcSize, int dstCapacity); - +LZ4_decompress_safe_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, + char* dst, + int srcSize, + int dstCapacity); /*! LZ4_decompress_safe_usingDict() : * Works the same as - * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue() - * However, it's stateless: it doesn't need any LZ4_streamDecode_t state. - * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. - * Performance tip : Decompression speed can be substantially increased - * when dst == dictStart + dictSize. + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() However, it's stateless: it doesn't need any + * LZ4_streamDecode_t state. Dictionary is presumed stable : it must remain + * accessible and unmodified during decompression. Performance tip : + * Decompression speed can be substantially increased when dst == dictStart + + * dictSize. */ -LZ4LIB_API int -LZ4_decompress_safe_usingDict(const char* src, char* dst, - int srcSize, int dstCapacity, - const char* dictStart, int dictSize); +LZ4LIB_API int LZ4_decompress_safe_usingDict(const char* src, + char* dst, + int srcSize, + int dstCapacity, + const char* dictStart, + int dictSize); /*! LZ4_decompress_safe_partial_usingDict() : * Behaves the same as LZ4_decompress_safe_partial() @@ -557,15 +637,16 @@ LZ4_decompress_safe_usingDict(const char* src, char* dst, * Performance tip : Decompression speed can be substantially increased * when dst == dictStart + dictSize. */ -LZ4LIB_API int -LZ4_decompress_safe_partial_usingDict(const char* src, char* dst, - int compressedSize, - int targetOutputSize, int maxOutputSize, - const char* dictStart, int dictSize); +LZ4LIB_API int LZ4_decompress_safe_partial_usingDict(const char* src, + char* dst, + int compressedSize, + int targetOutputSize, + int maxOutputSize, + const char* dictStart, + int dictSize); #endif /* LZ4_H_2983827168210 */ - /*^************************************* * !!!!!! STATIC LINKING ONLY !!!!!! 
***************************************/ @@ -596,29 +677,39 @@ LZ4_decompress_safe_partial_usingDict(const char* src, char* dst, #define LZ4_STATIC_3504398509 #ifdef LZ4_PUBLISH_STATIC_FUNCTIONS -# define LZ4LIB_STATIC_API LZ4LIB_API +#define LZ4LIB_STATIC_API LZ4LIB_API #else -# define LZ4LIB_STATIC_API +#define LZ4LIB_STATIC_API #endif - /*! LZ4_compress_fast_extState_fastReset() : * A variant of LZ4_compress_fast_extState(). * * Using this variant avoids an expensive initialization step. - * It is only safe to call if the state buffer is known to be correctly initialized already - * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). - * From a high level, the difference is that - * this function initializes the provided state with a call to something like LZ4_resetStream_fast() - * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + * It is only safe to call if the state buffer is known to be correctly + * initialized already (see above comment on LZ4_resetStream_fast() for a + * definition of "correctly initialized"). From a high level, the difference is + * that this function initializes the provided state with a call to something + * like LZ4_resetStream_fast() while LZ4_compress_fast_extState() starts with a + * call to LZ4_resetStream(). */ -LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); /*! LZ4_compress_destSize_extState() : introduced in v1.10.0 * Same as LZ4_compress_destSize(), but using an externally allocated state. * Also: exposes @acceleration */ -int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration); +int LZ4_compress_destSize_extState(void* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration); /*! In-place compression and decompression * @@ -641,50 +732,66 @@ int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). * This presumes that decompressedSize > compressedSize. * Otherwise, it means compression actually expanded data, - * and it would be more efficient to store such data with a flag indicating it's not compressed. - * This can happen when data is not compressible (already compressed, or encrypted). + * and it would be more efficient to store such data with a flag indicating it's + * not compressed. This can happen when data is not compressible (already + * compressed, or encrypted). * - * For in-place compression, margin is larger, as it must be able to cope with both - * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, - * and data expansion, which can happen when input is not compressible. - * As a consequence, buffer size requirements are much higher, - * and memory savings offered by in-place compression are more limited. + * For in-place compression, margin is larger, as it must be able to cope with + * both history preservation, requiring input data to remain unmodified up to + * LZ4_DISTANCE_MAX, and data expansion, which can happen when input is not + * compressible. 
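A sketch of the external-state pattern these declarations serve, assuming the LZ4_sizeofState() accessor declared earlier in lz4.h (illustrative, not part of the patch):

    #include "lz4.h"
    #include <stdlib.h>

    /* Allocate one compression state up front and reuse it for every call. */
    static void* g_state;

    static int state_init(void)
    {
        g_state = malloc((size_t)LZ4_sizeofState());
        return g_state != NULL;
    }

    static int compress_one(const char* src, int srcSize, char* dst, int dstCapacity)
    {
        /* The first call initializes the state in full; once the state is known
           to be correctly initialized, LZ4_compress_fast_extState_fastReset()
           could be used instead to skip that cost. */
        return LZ4_compress_fast_extState(g_state, src, dst, srcSize, dstCapacity, 1);
    }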
As a consequence, buffer size requirements are much higher, and + * memory savings offered by in-place compression are more limited. * * There are ways to limit this cost for compression : * - Reduce history size, by modifying LZ4_DISTANCE_MAX. - * Note that it is a compile-time constant, so all compressions will apply this limit. - * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, - * so it's a reasonable trick when inputs are known to be small. + * Note that it is a compile-time constant, so all compressions will apply + * this limit. Lower values will reduce compression ratio, except when + * input_size < LZ4_DISTANCE_MAX, so it's a reasonable trick when inputs are + * known to be small. * - Require the compressor to deliver a "maximum compressed size". * This is the `dstCapacity` parameter in `LZ4_compress*()`. - * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, - * in which case, the return code will be 0 (zero). - * The caller must be ready for these cases to happen, - * and typically design a backup scheme to send data uncompressed. - * The combination of both techniques can significantly reduce - * the amount of margin required for in-place compression. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can + * fail, in which case, the return code will be 0 (zero). The caller must be + * ready for these cases to happen, and typically design a backup scheme to send + * data uncompressed. The combination of both techniques can significantly + * reduce the amount of margin required for in-place compression. * * In-place compression can work in any buffer * which size is >= (maxCompressedSize) - * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. - * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, - * so it's possible to reduce memory requirements by playing with them. + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed + * compression success. LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both + * maxCompressedSize and LZ4_DISTANCE_MAX, so it's possible to reduce memory + * requirements by playing with them. */ -#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) -#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */ - -#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ -# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) \ + (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) \ + ((decompressedSize) + \ + LZ4_DECOMPRESS_INPLACE_MARGIN( \ + decompressedSize)) /**< note: presumes that compressedSize < \ + decompressedSize. 
note2: margin is overestimated \
+                                      a bit, since it could use compressedSize instead \
+                                    */
+
+#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at \
+                            compile time */
+#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
 #endif
 
-#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */
-#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */
-
-#endif /* LZ4_STATIC_3504398509 */
-#endif /* LZ4_STATIC_LINKING_ONLY */
-
+#define LZ4_COMPRESS_INPLACE_MARGIN \
+   (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by \
+                              srcSize when it's smaller */
+#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) \
+   ((maxCompressedSize) + \
+    LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally \
+                                    LZ4_COMPRESSBOUND(inputSize), but can be \
+                                    set to any lower value, with the risk that \
+                                    compression can fail (return code 0(zero)) \
+                                  */
+#endif /* LZ4_STATIC_3504398509 */
+#endif /* LZ4_STATIC_LINKING_ONLY */
 
 #ifndef LZ4_H_98237428734687
 #define LZ4_H_98237428734687
@@ -693,50 +800,54 @@ int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int*
 * Private Definitions
 **************************************************************
 * Do not use these definitions directly.
- * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`.
- * Accessing members will expose user code to API and/or ABI break in future versions of the library.
+ * They are only exposed to allow static allocation of `LZ4_stream_t` and
+ *`LZ4_streamDecode_t`. Accessing members will expose user code to API and/or
+ *ABI break in future versions of the library.
 **************************************************************/
-#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2)
+#define LZ4_HASHLOG (LZ4_MEMORY_USAGE - 2)
 #define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
-#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */
-
-#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
-# include <stdint.h>
- typedef int8_t LZ4_i8;
- typedef unsigned char LZ4_byte;
- typedef uint16_t LZ4_u16;
- typedef uint32_t LZ4_u32;
+#define LZ4_HASH_SIZE_U32 \
+   (1 << LZ4_HASHLOG) /* required as macro for static allocation */
+
+#if defined(__cplusplus) || \
+    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#include <stdint.h>
+typedef int8_t LZ4_i8;
+typedef unsigned char LZ4_byte;
+typedef uint16_t LZ4_u16;
+typedef uint32_t LZ4_u32;
 #else
- typedef signed char LZ4_i8;
- typedef unsigned char LZ4_byte;
- typedef unsigned short LZ4_u16;
- typedef unsigned int LZ4_u32;
+typedef signed char LZ4_i8;
+typedef unsigned char LZ4_byte;
+typedef unsigned short LZ4_u16;
+typedef unsigned int LZ4_u32;
 #endif
 
 /*! LZ4_stream_t :
 * Never ever use below internal definitions directly !
 * These definitions are not API/ABI safe, and may change in future versions.
 * If you need static allocation, declare or allocate an LZ4_stream_t object.
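The in-place sizing macros above are easier to grasp with a concrete shape; a minimal decompression-in-place sketch (illustrative, not part of the patch; it assumes LZ4_STATIC_LINKING_ONLY is defined so the macros are visible):

    #define LZ4_STATIC_LINKING_ONLY
    #include "lz4.h"
    #include <string.h>

    /* Decompress into the same buffer that holds the compressed input:
       stage the input at the tail, write the output from the head. */
    static int decompress_in_place(const char* cmp, int cmpSize, int decSize, char* buf)
    {
        /* buf must be at least LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decSize) bytes. */
        char* stage = buf + LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decSize) - cmpSize;
        memmove(stage, cmp, (size_t)cmpSize);
        return LZ4_decompress_safe(stage, buf, cmpSize, decSize);
    }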
-**/ + **/ typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; struct LZ4_stream_t_internal { - LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; - const LZ4_byte* dictionary; - const LZ4_stream_t_internal* dictCtx; - LZ4_u32 currentOffset; - LZ4_u32 tableType; - LZ4_u32 dictSize; - /* Implicit padding to ensure structure is aligned */ + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ }; -#define LZ4_STREAM_MINSIZE ((1UL << (LZ4_MEMORY_USAGE)) + 32) /* static size, for inter-version compatibility */ +#define LZ4_STREAM_MINSIZE \ + ((1UL << (LZ4_MEMORY_USAGE)) + \ + 32) /* static size, for inter-version compatibility */ union LZ4_stream_u { - char minStateSize[LZ4_STREAM_MINSIZE]; - LZ4_stream_t_internal internal_donotuse; + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; }; /* previously typedef'd to LZ4_stream_t */ - /*! LZ4_initStream() : v1.9.0+ * An LZ4_stream_t structure must be initialized at least once. * This is automatically done when invoking LZ4_createStream(), @@ -746,77 +857,105 @@ union LZ4_stream_u { * It can also initialize any arbitrary buffer of sufficient size, * and will @return a pointer of proper type upon initialization. * - * Note : initialization fails if size and alignment conditions are not respected. - * In which case, the function will @return NULL. - * Note2: An LZ4_stream_t structure guarantees correct alignment and size. - * Note3: Before v1.9.0, use LZ4_resetStream() instead -**/ -LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* stateBuffer, size_t size); - + * Note : initialization fails if size and alignment conditions are not + *respected. In which case, the function will @return NULL. Note2: An + *LZ4_stream_t structure guarantees correct alignment and size. Note3: Before + *v1.9.0, use LZ4_resetStream() instead + **/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream(void* stateBuffer, size_t size); /*! LZ4_streamDecode_t : * Never ever use below internal definitions directly ! * These definitions are not API/ABI safe, and may change in future versions. - * If you need static allocation, declare or allocate an LZ4_streamDecode_t object. -**/ + * If you need static allocation, declare or allocate an LZ4_streamDecode_t + *object. + **/ typedef struct { - const LZ4_byte* externalDict; - const LZ4_byte* prefixEnd; - size_t extDictSize; - size_t prefixSize; + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; } LZ4_streamDecode_t_internal; #define LZ4_STREAMDECODE_MINSIZE 32 union LZ4_streamDecode_u { - char minStateSize[LZ4_STREAMDECODE_MINSIZE]; - LZ4_streamDecode_t_internal internal_donotuse; -} ; /* previously typedef'd to LZ4_streamDecode_t */ - - + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_streamDecode_t */ /*-************************************ -* Obsolete Functions -**************************************/ + * Obsolete Functions + **************************************/ /*! Deprecation warnings * * Deprecated functions make the compiler generate a warning when invoked. * This is meant to invite users to update their source code. - * Should deprecation warnings be a problem, it is generally possible to disable them, - * typically with -Wno-deprecated-declarations for gcc - * or _CRT_SECURE_NO_WARNINGS in Visual. 
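For completeness, the static-allocation path that LZ4_initStream() exists for, in a few lines (illustrative, not part of the patch):

    #include "lz4.h"

    static LZ4_stream_t g_stream; /* statically allocated state */

    static int stream_init(void)
    {
        /* LZ4_initStream() returns NULL if size/alignment checks fail. */
        return LZ4_initStream(&g_stream, sizeof(g_stream)) != NULL;
    }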
+ * Should deprecation warnings be a problem, it is generally possible to + * disable them, typically with -Wno-deprecated-declarations for gcc or + * _CRT_SECURE_NO_WARNINGS in Visual. * * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS * before including the header file. */ #ifdef LZ4_DISABLE_DEPRECATE_WARNINGS -# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +#if defined(__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +#define LZ4_DEPRECATED(message) [[deprecated(message)]] +#elif defined(_MSC_VER) +#define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__clang__) || \ + (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +#define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +#elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +#define LZ4_DEPRECATED(message) __attribute__((deprecated)) #else -# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ -# define LZ4_DEPRECATED(message) [[deprecated(message)]] -# elif defined(_MSC_VER) -# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) -# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) -# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) -# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) -# define LZ4_DEPRECATED(message) __attribute__((deprecated)) -# else -# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") -# define LZ4_DEPRECATED(message) /* disabled */ -# endif +#pragma message( \ + "WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +#define LZ4_DEPRECATED(message) /* disabled */ +#endif #endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ /*! 
Obsolete compression functions (since v1.7.3) */ -LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); -LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") +LZ4LIB_API int LZ4_compress(const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") +LZ4LIB_API int LZ4_compress_limitedOutput(const char* src, + char* dest, + int srcSize, + int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") +LZ4LIB_API int LZ4_compress_withState(void* state, + const char* source, + char* dest, + int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") +LZ4LIB_API int LZ4_compress_limitedOutput_withState(void* state, + const char* source, + char* dest, + int inputSize, + int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") +LZ4LIB_API int LZ4_compress_continue(LZ4_stream_t* LZ4_streamPtr, + const char* source, + char* dest, + int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") +LZ4LIB_API int LZ4_compress_limitedOutput_continue(LZ4_stream_t* LZ4_streamPtr, + const char* source, + char* dest, + int inputSize, + int maxOutputSize); /*! Obsolete decompression functions (since v1.8.0) */ -LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); -LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") +LZ4LIB_API int LZ4_uncompress(const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_uncompress_unknownOutputSize(const char* source, + char* dest, + int isize, + int maxOutputSize); /* Obsolete streaming functions (since v1.7.0) * degraded functionality; do not use! @@ -828,22 +967,34 @@ LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompres * achieved will therefore be no better than compressing each chunk * independently. 
*/ -LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); -LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); -LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); -LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); +LZ4_DEPRECATED("Use LZ4_createStream() instead") +LZ4LIB_API void* LZ4_create(char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") +LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") +LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") +LZ4LIB_API char* LZ4_slideInputBuffer(void* state); /*! Obsolete streaming decoding functions (since v1.7.0) */ -LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); -LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_safe_withPrefix64k(const char* src, + char* dst, + int compressedSize, + int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") +LZ4LIB_API int +LZ4_decompress_fast_withPrefix64k(const char* src, char* dst, int originalSize); /*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : * These functions used to be faster than LZ4_decompress_safe(), * but this is no longer the case. They are now slower. * This is because LZ4_decompress_fast() doesn't know the input size, - * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. - * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. - * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * and therefore must progress more cautiously into the input buffer to not + * read beyond the end of block. On top of that `LZ4_decompress_fast()` is not + * protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and + * deprecated. * * The last remaining LZ4_decompress_fast() specificity is that * it can decompress a block without knowing its compressed size. @@ -852,24 +1003,40 @@ LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4 * * Parameters: * originalSize : is the uncompressed size to regenerate. - * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * `dst` must be already allocated, its size must be >= + * 'originalSize' bytes. * @return : number of bytes read from source buffer (== compressed size). * The function expects to finish at block's end exactly. - * If the source stream is detected malformed, the function stops decoding and returns a negative result. - * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. - * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. - * Also, since match offsets are not validated, match reads from 'src' may underflow too. 
- * These issues never happen if input (compressed) data is correct. - * But they may happen if input data is invalid (error or intentional tampering). - * As a consequence, use these functions in trusted environments with trusted data **only**. + * If the source stream is detected malformed, the function stops + * decoding and returns a negative result. note : LZ4_decompress_fast*() + * requires originalSize. Thanks to this information, it never writes past the + * output buffer. However, since it doesn't know its 'src' size, it may read an + * unknown amount of input, past input buffer bounds. Also, since match offsets + * are not validated, match reads from 'src' may underflow too. These issues + * never happen if input (compressed) data is correct. But they may happen if + * input data is invalid (error or intentional tampering). As a consequence, use + * these functions in trusted environments with trusted data **only**. */ -LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial() instead") -LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); -LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating towards LZ4_decompress_safe_continue() instead. " - "Note that the contract will change (requires block's compressed size, instead of decompressed size)") -LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); -LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial_usingDict() instead") -LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using " + "LZ4_decompress_safe_partial() instead") +LZ4LIB_API int +LZ4_decompress_fast(const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating " + "towards LZ4_decompress_safe_continue() instead. " + "Note that the contract will change (requires block's " + "compressed size, instead of decompressed size)") +LZ4LIB_API int +LZ4_decompress_fast_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, + char* dst, + int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using " + "LZ4_decompress_safe_partial_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict(const char* src, + char* dst, + int originalSize, + const char* dictStart, + int dictSize); /*! LZ4_resetStream() : * An LZ4_stream_t structure must be initialized at least once. @@ -877,12 +1044,10 @@ LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int or * Consider switching to LZ4_initStream(), * invoking LZ4_resetStream() will trigger deprecation warnings in the future. 
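The migration the deprecation messages point at is mechanical; a sketch of the safe replacement for LZ4_decompress_fast(), assuming the compressed size is known (illustrative, not part of the patch):

    #include "lz4.h"

    /* Old: LZ4_decompress_fast(src, dst, originalSize);  (unsafe, may over-read src)
       New: bound both the input and the output explicitly. */
    static int decode_block(const char* src, int srcSize, char* dst, int originalSize)
    {
        return LZ4_decompress_safe_partial(src, dst, srcSize, originalSize, originalSize);
    }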
*/ -LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); - +LZ4LIB_API void LZ4_resetStream(LZ4_stream_t* streamPtr); #endif /* LZ4_H_98237428734687 */ - -#if defined (__cplusplus) +#if defined(__cplusplus) } #endif diff --git a/tracegrind/main.c b/tracegrind/main.c index fb1345fb2..91d2b9498 100644 --- a/tracegrind/main.c +++ b/tracegrind/main.c @@ -30,12 +30,12 @@ */ #include "config.h" -#include "tracegrind.h" #include "global.h" +#include "tracegrind.h" -#include "pub_tool_threadstate.h" #include "pub_tool_gdbserver.h" -#include "pub_tool_transtab.h" // VG_(discard_translations_safely) +#include "pub_tool_threadstate.h" +#include "pub_tool_transtab.h" // VG_(discard_translations_safely) #include "cg_branchpred.c" @@ -45,8 +45,8 @@ /* for all threads */ CommandLineOptions TG_(clo); -Statistics TG_(stat); -Bool TG_(instrument_state) = True; /* Instrumentation on ? */ +Statistics TG_(stat); +Bool TG_(instrument_state) = True; /* Instrumentation on ? */ /* thread and signal handler specific */ exec_state TG_(current_state); @@ -55,51 +55,49 @@ exec_state TG_(current_state); non-zero value if we are doing cache simulation. */ Int TG_(min_line_size) = 0; - /*------------------------------------------------------------*/ /*--- Statistics ---*/ /*------------------------------------------------------------*/ static void TG_(init_statistics)(Statistics* s) { - s->call_counter = 0; - s->jcnd_counter = 0; - s->jump_counter = 0; - s->rec_call_counter = 0; - s->ret_counter = 0; - s->bb_executions = 0; - - s->context_counter = 0; - s->bb_retranslations = 0; - - s->distinct_objs = 0; - s->distinct_files = 0; - s->distinct_fns = 0; - s->distinct_contexts = 0; - s->distinct_bbs = 0; - s->distinct_bbccs = 0; - s->distinct_instrs = 0; - s->distinct_skips = 0; - - s->bb_hash_resizes = 0; - s->bbcc_hash_resizes = 0; - s->jcc_hash_resizes = 0; - s->cxt_hash_resizes = 0; - s->fn_array_resizes = 0; - s->call_stack_resizes = 0; - s->fn_stack_resizes = 0; - - s->full_debug_BBs = 0; - s->file_line_debug_BBs = 0; - s->fn_name_debug_BBs = 0; - s->no_debug_BBs = 0; - s->bbcc_lru_misses = 0; - s->jcc_lru_misses = 0; - s->cxt_lru_misses = 0; - s->bbcc_clones = 0; + s->call_counter = 0; + s->jcnd_counter = 0; + s->jump_counter = 0; + s->rec_call_counter = 0; + s->ret_counter = 0; + s->bb_executions = 0; + + s->context_counter = 0; + s->bb_retranslations = 0; + + s->distinct_objs = 0; + s->distinct_files = 0; + s->distinct_fns = 0; + s->distinct_contexts = 0; + s->distinct_bbs = 0; + s->distinct_bbccs = 0; + s->distinct_instrs = 0; + s->distinct_skips = 0; + + s->bb_hash_resizes = 0; + s->bbcc_hash_resizes = 0; + s->jcc_hash_resizes = 0; + s->cxt_hash_resizes = 0; + s->fn_array_resizes = 0; + s->call_stack_resizes = 0; + s->fn_stack_resizes = 0; + + s->full_debug_BBs = 0; + s->file_line_debug_BBs = 0; + s->fn_name_debug_BBs = 0; + s->no_debug_BBs = 0; + s->bbcc_lru_misses = 0; + s->jcc_lru_misses = 0; + s->cxt_lru_misses = 0; + s->bbcc_clones = 0; } - /*------------------------------------------------------------*/ /*--- Simple callbacks (not cache similator) ---*/ /*------------------------------------------------------------*/ @@ -107,88 +105,90 @@ static void TG_(init_statistics)(Statistics* s) VG_REGPARM(1) static void log_global_event(InstrInfo* ii) { - ULong* cost_Bus; + ULong* cost_Bus; - TG_DEBUG(6, "log_global_event: Ir %#lx/%u\n", - TG_(bb_base) + ii->instr_offset, ii->instr_size); + TG_DEBUG(6, "log_global_event: Ir %#lx/%u\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size); - if 
(!TG_(current_state).collect) return;
+ if (!TG_(current_state).collect)
+ return;
- TG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
+ TG_ASSERT((ii->eventset->mask & (1u << EG_BUS)) > 0);
- TG_(current_state).cost[ fullOffset(EG_BUS) ]++;
+ TG_(current_state).cost[fullOffset(EG_BUS)]++;
- if (TG_(current_state).nonskipped)
- cost_Bus = TG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
- else
- cost_Bus = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
- cost_Bus[0]++;
+ if (TG_(current_state).nonskipped)
+ cost_Bus = TG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
+ else
+ cost_Bus =
+ TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
+ cost_Bus[0]++;
 }
-
 /* For branches, we consult two different predictors, one which predicts taken/untaken for conditional branches, and the other which predicts the branch target address for indirect branches (jump-to-register style ones). */
-static VG_REGPARM(2)
-void log_cond_branch(InstrInfo* ii, Word taken)
+static VG_REGPARM(2) void log_cond_branch(InstrInfo* ii, Word taken)
 {
- Bool miss;
- Int fullOffset_Bc;
- ULong* cost_Bc;
+ Bool miss;
+ Int fullOffset_Bc;
+ ULong* cost_Bc;
- TG_DEBUG(6, "log_cond_branch: Ir %#lx, taken %ld\n",
- TG_(bb_base) + ii->instr_offset, taken);
+ TG_DEBUG(6, "log_cond_branch: Ir %#lx, taken %ld\n",
+ TG_(bb_base) + ii->instr_offset, taken);
- miss = 1 & do_cond_branch_predict(TG_(bb_base) + ii->instr_offset, taken);
+ miss = 1 & do_cond_branch_predict(TG_(bb_base) + ii->instr_offset, taken);
- if (!TG_(current_state).collect) return;
+ if (!TG_(current_state).collect)
+ return;
- TG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
+ TG_ASSERT((ii->eventset->mask & (1u << EG_BC)) > 0);
- if (TG_(current_state).nonskipped)
- cost_Bc = TG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
- else
- cost_Bc = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
+ if (TG_(current_state).nonskipped)
+ cost_Bc = TG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
+ else
+ cost_Bc = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
- fullOffset_Bc = fullOffset(EG_BC);
- TG_(current_state).cost[ fullOffset_Bc ]++;
- cost_Bc[0]++;
- if (miss) {
- TG_(current_state).cost[ fullOffset_Bc+1 ]++;
- cost_Bc[1]++;
- }
+ fullOffset_Bc = fullOffset(EG_BC);
+ TG_(current_state).cost[fullOffset_Bc]++;
+ cost_Bc[0]++;
+ if (miss) {
+ TG_(current_state).cost[fullOffset_Bc + 1]++;
+ cost_Bc[1]++;
+ }
 }
-static VG_REGPARM(2)
-void log_ind_branch(InstrInfo* ii, UWord actual_dst)
+static VG_REGPARM(2) void log_ind_branch(InstrInfo* ii, UWord actual_dst)
 {
- Bool miss;
- Int fullOffset_Bi;
- ULong* cost_Bi;
+ Bool miss;
+ Int fullOffset_Bi;
+ ULong* cost_Bi;
- TG_DEBUG(6, "log_ind_branch: Ir %#lx, dst %#lx\n",
- TG_(bb_base) + ii->instr_offset, actual_dst);
+ TG_DEBUG(6, "log_ind_branch: Ir %#lx, dst %#lx\n",
+ TG_(bb_base) + ii->instr_offset, actual_dst);
- miss = 1 & do_ind_branch_predict(TG_(bb_base) + ii->instr_offset, actual_dst);
+ miss =
+ 1 & do_ind_branch_predict(TG_(bb_base) + ii->instr_offset, actual_dst);
- if (!TG_(current_state).collect) return;
+ if (!TG_(current_state).collect)
+ return;
- TG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
+ TG_ASSERT((ii->eventset->mask & (1u << EG_BI)) > 0);
- if (TG_(current_state).nonskipped)
- cost_Bi = TG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
- else
- cost_Bi = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
+ if (TG_(current_state).nonskipped)
+ cost_Bi =
TG_(current_state).nonskipped->skipped + fullOffset(EG_BI); + else + cost_Bi = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI]; - fullOffset_Bi = fullOffset(EG_BI); - TG_(current_state).cost[ fullOffset_Bi ]++; - cost_Bi[0]++; - if (miss) { - TG_(current_state).cost[ fullOffset_Bi+1 ]++; - cost_Bi[1]++; - } + fullOffset_Bi = fullOffset(EG_BI); + TG_(current_state).cost[fullOffset_Bi]++; + cost_Bi[0]++; + if (miss) { + TG_(current_state).cost[fullOffset_Bi + 1]++; + cost_Bi[1]++; + } } /*------------------------------------------------------------*/ @@ -224,148 +224,147 @@ void log_ind_branch(InstrInfo* ii, UWord actual_dst) For example, it could well be profitable to handle two adjacent Ir events with a single helper call. */ -typedef - IRExpr - IRAtom; - -typedef - enum { - Ev_Ir, // Instruction read - Ev_Dr, // Data read - Ev_Dw, // Data write - Ev_Dm, // Data modify (read then write) - Ev_Bc, // branch conditional - Ev_Bi, // branch indirect (to unknown destination) - Ev_G // Global bus event - } - EventTag; - -typedef - struct { - EventTag tag; - InstrInfo* inode; - union { - struct { - } Ir; - struct { - IRAtom* ea; - Int szB; - } Dr; - struct { - IRAtom* ea; - Int szB; - } Dw; - struct { - IRAtom* ea; - Int szB; - } Dm; - struct { - IRAtom* taken; /* :: Ity_I1 */ - } Bc; - struct { - IRAtom* dst; - } Bi; - struct { - } G; - } Ev; - } - Event; +typedef IRExpr IRAtom; -static void init_Event ( Event* ev ) { - VG_(memset)(ev, 0, sizeof(Event)); -} +typedef enum { + Ev_Ir, // Instruction read + Ev_Dr, // Data read + Ev_Dw, // Data write + Ev_Dm, // Data modify (read then write) + Ev_Bc, // branch conditional + Ev_Bi, // branch indirect (to unknown destination) + Ev_G // Global bus event +} EventTag; -static IRAtom* get_Event_dea ( Event* ev ) { +typedef struct { + EventTag tag; + InstrInfo* inode; + union { + struct { + } Ir; + struct { + IRAtom* ea; + Int szB; + } Dr; + struct { + IRAtom* ea; + Int szB; + } Dw; + struct { + IRAtom* ea; + Int szB; + } Dm; + struct { + IRAtom* taken; /* :: Ity_I1 */ + } Bc; + struct { + IRAtom* dst; + } Bi; + struct { + } G; + } Ev; +} Event; + +static void init_Event(Event* ev) { VG_(memset)(ev, 0, sizeof(Event)); } + +static IRAtom* get_Event_dea(Event* ev) +{ switch (ev->tag) { - case Ev_Dr: return ev->Ev.Dr.ea; - case Ev_Dw: return ev->Ev.Dw.ea; - case Ev_Dm: return ev->Ev.Dm.ea; - default: tl_assert(0); + case Ev_Dr: + return ev->Ev.Dr.ea; + case Ev_Dw: + return ev->Ev.Dw.ea; + case Ev_Dm: + return ev->Ev.Dm.ea; + default: + tl_assert(0); } } -static Int get_Event_dszB ( Event* ev ) { +static Int get_Event_dszB(Event* ev) +{ switch (ev->tag) { - case Ev_Dr: return ev->Ev.Dr.szB; - case Ev_Dw: return ev->Ev.Dw.szB; - case Ev_Dm: return ev->Ev.Dm.szB; - default: tl_assert(0); + case Ev_Dr: + return ev->Ev.Dr.szB; + case Ev_Dw: + return ev->Ev.Dw.szB; + case Ev_Dm: + return ev->Ev.Dm.szB; + default: + tl_assert(0); } } - /* Up to this many unnotified events are allowed. Number is arbitrary. Larger numbers allow more event merging to occur, but potentially induce more spilling due to extending live ranges of address temporaries. */ #define N_EVENTS 16 - /* A struct which holds all the running state during instrumentation. Mostly to avoid passing loads of parameters everywhere. */ typedef struct { - /* The current outstanding-memory-event list. */ - Event events[N_EVENTS]; - Int events_used; + /* The current outstanding-memory-event list. */ + Event events[N_EVENTS]; + Int events_used; - /* The array of InstrInfo's is part of BB struct. 
*/ - BB* bb; + /* The array of InstrInfo's is part of BB struct. */ + BB* bb; - /* BB seen before (ie. re-instrumentation) */ - Bool seen_before; + /* BB seen before (ie. re-instrumentation) */ + Bool seen_before; - /* Number InstrInfo bins 'used' so far. */ - UInt ii_index; + /* Number InstrInfo bins 'used' so far. */ + UInt ii_index; - // current offset of guest instructions from BB start - UInt instr_offset; + // current offset of guest instructions from BB start + UInt instr_offset; - /* The output SB being constructed. */ - IRSB* sbOut; + /* The output SB being constructed. */ + IRSB* sbOut; } ClgState; - -static void showEvent ( Event* ev ) +static void showEvent(Event* ev) { switch (ev->tag) { - case Ev_Ir: - VG_(printf)("Ir (InstrInfo %p) at +%u\n", - ev->inode, ev->inode->instr_offset); - break; - case Ev_Dr: - VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=", - ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB); - ppIRExpr(ev->Ev.Dr.ea); - VG_(printf)("\n"); - break; - case Ev_Dw: - VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=", - ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB); - ppIRExpr(ev->Ev.Dw.ea); - VG_(printf)("\n"); - break; - case Ev_Dm: - VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=", - ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB); - ppIRExpr(ev->Ev.Dm.ea); - VG_(printf)("\n"); - break; - case Ev_Bc: - VG_(printf)("Bc %p GA=", ev->inode); - ppIRExpr(ev->Ev.Bc.taken); - VG_(printf)("\n"); - break; - case Ev_Bi: - VG_(printf)("Bi %p DST=", ev->inode); - ppIRExpr(ev->Ev.Bi.dst); - VG_(printf)("\n"); - break; - case Ev_G: - VG_(printf)("G %p\n", ev->inode); - break; - default: - tl_assert(0); - break; + case Ev_Ir: + VG_(printf)("Ir (InstrInfo %p) at +%u\n", ev->inode, + ev->inode->instr_offset); + break; + case Ev_Dr: + VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dr.szB); + ppIRExpr(ev->Ev.Dr.ea); + VG_(printf)("\n"); + break; + case Ev_Dw: + VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dw.szB); + ppIRExpr(ev->Ev.Dw.ea); + VG_(printf)("\n"); + break; + case Ev_Dm: + VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dm.szB); + ppIRExpr(ev->Ev.Dm.ea); + VG_(printf)("\n"); + break; + case Ev_Bc: + VG_(printf)("Bc %p GA=", ev->inode); + ppIRExpr(ev->Ev.Bc.taken); + VG_(printf)("\n"); + break; + case Ev_Bi: + VG_(printf)("Bi %p DST=", ev->inode); + ppIRExpr(ev->Ev.Bi.dst); + VG_(printf)("\n"); + break; + case Ev_G: + VG_(printf)("G %p\n", ev->inode); + break; + default: + tl_assert(0); + break; } } @@ -373,62 +372,62 @@ static void showEvent ( Event* ev ) empty. Code is generated into cgs->sbOut, and this activity 'consumes' slots in cgs->bb. 
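As an aid for reading flushEvents() below, the merge patterns it emits can be summarized as follows (derived from the switch further down; Ir/Dr/Dw/Dm/Bc/Bi/G are the EventTag values defined above, and the log_* helpers come from TG_(cachesim) or from the callbacks earlier in this file):

    event(s) at the head of the queue   ->  helper call emitted
    Ir                                  ->  log_1I0D(inode)
    Ir, Ir                              ->  log_2I0D(inode1, inode2)
    Ir, Ir, Ir                          ->  log_3I0D(inode1, inode2, inode3)
    Ir, Dr                              ->  log_1I1Dr(inode, ea, szB)
    Ir, Dw/Dm                           ->  log_1I1Dw(inode, ea, szB)
    Dr                                  ->  log_0I1Dr(inode, ea, szB)
    Dw/Dm                               ->  log_0I1Dw(inode, ea, szB)
    Bc                                  ->  log_cond_branch(inode, taken)
    Bi                                  ->  log_ind_branch(inode, dst)
    G                                   ->  log_global_event(inode)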
*/
-static void flushEvents ( ClgState* clgs )
+static void flushEvents(ClgState* clgs)
 {
- Int i, regparms, inew;
+ Int i, regparms, inew;
 const HChar* helperName;
- void* helperAddr;
- IRExpr** argv;
- IRExpr* i_node_expr;
- IRDirty* di;
- Event* ev;
- Event* ev2;
- Event* ev3;
+ void* helperAddr;
+ IRExpr** argv;
+ IRExpr* i_node_expr;
+ IRDirty* di;
+ Event* ev;
+ Event* ev2;
+ Event* ev3;
 if (!clgs->seen_before) {
- // extend event sets as needed
- // available sets: D0 Dr
- for(i=0; i<clgs->events_used; i++) {
- ev = &clgs->events[i];
- switch(ev->tag) {
- case Ev_Ir:
- // Ir event always is first for a guest instruction
- TG_ASSERT(ev->inode->eventset == 0);
- ev->inode->eventset = TG_(sets).base;
- break;
- case Ev_Dr:
- // extend event set by Dr counters
- ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset,
- EG_DR);
- break;
- case Ev_Dw:
- case Ev_Dm:
- // extend event set by Dw counters
- ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset,
- EG_DW);
- break;
- case Ev_Bc:
- // extend event set by Bc counters
- ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset,
- EG_BC);
- break;
- case Ev_Bi:
- // extend event set by Bi counters
- ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset,
- EG_BI);
- break;
- case Ev_G:
- // extend event set by Bus counter
- ev->inode->eventset = TG_(add_event_group)(ev->inode->eventset,
- EG_BUS);
- break;
- default:
- tl_assert(0);
- }
- }
+ // extend event sets as needed
+ // available sets: D0 Dr
+ for (i = 0; i < clgs->events_used; i++) {
+ ev = &clgs->events[i];
+ switch (ev->tag) {
+ case Ev_Ir:
+ // Ir event always is first for a guest instruction
+ TG_ASSERT(ev->inode->eventset == 0);
+ ev->inode->eventset = TG_(sets).base;
+ break;
+ case Ev_Dr:
+ // extend event set by Dr counters
+ ev->inode->eventset =
+ TG_(add_event_group)(ev->inode->eventset, EG_DR);
+ break;
+ case Ev_Dw:
+ case Ev_Dm:
+ // extend event set by Dw counters
+ ev->inode->eventset =
+ TG_(add_event_group)(ev->inode->eventset, EG_DW);
+ break;
+ case Ev_Bc:
+ // extend event set by Bc counters
+ ev->inode->eventset =
+ TG_(add_event_group)(ev->inode->eventset, EG_BC);
+ break;
+ case Ev_Bi:
+ // extend event set by Bi counters
+ ev->inode->eventset =
+ TG_(add_event_group)(ev->inode->eventset, EG_BI);
+ break;
+ case Ev_G:
+ // extend event set by Bus counter
+ ev->inode->eventset =
+ TG_(add_event_group)(ev->inode->eventset, EG_BUS);
+ break;
+ default:
+ tl_assert(0);
+ }
+ }
 }
- for(i = 0; i < clgs->events_used; i = inew) {
+ for (i = 0; i < clgs->events_used; i = inew) {
 helperName = NULL;
 helperAddr = NULL;
@@ -436,188 +435,183 @@ static void flushEvents ( ClgState* clgs )
 regparms = 0;
 /* generate IR to notify event i and possibly the ones
- immediately following it. */
+ immediately following it. */
 tl_assert(i >= 0 && i < clgs->events_used);
 ev = &clgs->events[i];
- ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
- ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );
+ ev2 = (i < clgs->events_used - 1 ? &clgs->events[i + 1] : NULL);
+ ev3 = (i < clgs->events_used - 2 ? &clgs->events[i + 2] : NULL);
- TG_DEBUGIF(5) {
- VG_(printf)(" flush ");
- showEvent( ev );
+ TG_DEBUGIF(5)
+ {
+ VG_(printf)(" flush ");
+ showEvent(ev);
 }
- i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );
+ i_node_expr = mkIRExpr_HWord((HWord)ev->inode);
 /* Decide on helper fn to call and args to pass it, and advance
- i appropriately.
- Dm events have same effect as Dw events */
+ i appropriately.
+ Dm events have same effect as Dw events */ switch (ev->tag) { - case Ev_Ir: - /* Merge an Ir with a following Dr. */ - if (ev2 && ev2->tag == Ev_Dr) { - /* Why is this true? It's because we're merging an Ir - with a following Dr. The Ir derives from the - instruction's IMark and the Dr from data - references which follow it. In short it holds - because each insn starts with an IMark, hence an - Ev_Ir, and so these Dr must pertain to the - immediately preceding Ir. Same applies to analogous - assertions in the subsequent cases. */ - tl_assert(ev2->inode == ev->inode); - helperName = TG_(cachesim).log_1I1Dr_name; - helperAddr = TG_(cachesim).log_1I1Dr; - argv = mkIRExprVec_3( i_node_expr, - get_Event_dea(ev2), - mkIRExpr_HWord( get_Event_dszB(ev2) ) ); - regparms = 3; - inew = i+2; - } - /* Merge an Ir with a following Dw/Dm. */ - else - if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) { - tl_assert(ev2->inode == ev->inode); - helperName = TG_(cachesim).log_1I1Dw_name; - helperAddr = TG_(cachesim).log_1I1Dw; - argv = mkIRExprVec_3( i_node_expr, - get_Event_dea(ev2), - mkIRExpr_HWord( get_Event_dszB(ev2) ) ); - regparms = 3; - inew = i+2; - } - /* Merge an Ir with two following Irs. */ - else - if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) { - helperName = TG_(cachesim).log_3I0D_name; - helperAddr = TG_(cachesim).log_3I0D; - argv = mkIRExprVec_3( i_node_expr, - mkIRExpr_HWord( (HWord)ev2->inode ), - mkIRExpr_HWord( (HWord)ev3->inode ) ); - regparms = 3; - inew = i+3; - } - /* Merge an Ir with one following Ir. */ - else - if (ev2 && ev2->tag == Ev_Ir) { - helperName = TG_(cachesim).log_2I0D_name; - helperAddr = TG_(cachesim).log_2I0D; - argv = mkIRExprVec_2( i_node_expr, - mkIRExpr_HWord( (HWord)ev2->inode ) ); - regparms = 2; - inew = i+2; - } - /* No merging possible; emit as-is. */ - else { - helperName = TG_(cachesim).log_1I0D_name; - helperAddr = TG_(cachesim).log_1I0D; - argv = mkIRExprVec_1( i_node_expr ); - regparms = 1; - inew = i+1; - } - break; - case Ev_Dr: - /* Data read or modify */ - helperName = TG_(cachesim).log_0I1Dr_name; - helperAddr = TG_(cachesim).log_0I1Dr; - argv = mkIRExprVec_3( i_node_expr, - get_Event_dea(ev), - mkIRExpr_HWord( get_Event_dszB(ev) ) ); - regparms = 3; - inew = i+1; - break; - case Ev_Dw: - case Ev_Dm: - /* Data write */ - helperName = TG_(cachesim).log_0I1Dw_name; - helperAddr = TG_(cachesim).log_0I1Dw; - argv = mkIRExprVec_3( i_node_expr, - get_Event_dea(ev), - mkIRExpr_HWord( get_Event_dszB(ev) ) ); - regparms = 3; - inew = i+1; - break; - case Ev_Bc: - /* Conditional branch */ - helperName = "log_cond_branch"; - helperAddr = &log_cond_branch; - argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken ); - regparms = 2; - inew = i+1; - break; - case Ev_Bi: - /* Branch to an unknown destination */ - helperName = "log_ind_branch"; - helperAddr = &log_ind_branch; - argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst ); + case Ev_Ir: + /* Merge an Ir with a following Dr. */ + if (ev2 && ev2->tag == Ev_Dr) { + /* Why is this true? It's because we're merging an Ir + with a following Dr. The Ir derives from the + instruction's IMark and the Dr from data + references which follow it. In short it holds + because each insn starts with an IMark, hence an + Ev_Ir, and so these Dr must pertain to the + immediately preceding Ir. Same applies to analogous + assertions in the subsequent cases. 
*/ + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dr_name; + helperAddr = TG_(cachesim).log_1I1Dr; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev2), + mkIRExpr_HWord(get_Event_dszB(ev2))); + regparms = 3; + inew = i + 2; + } + /* Merge an Ir with a following Dw/Dm. */ + else if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) { + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dw_name; + helperAddr = TG_(cachesim).log_1I1Dw; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev2), + mkIRExpr_HWord(get_Event_dszB(ev2))); + regparms = 3; + inew = i + 2; + } + /* Merge an Ir with two following Irs. */ + else if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) { + helperName = TG_(cachesim).log_3I0D_name; + helperAddr = TG_(cachesim).log_3I0D; + argv = mkIRExprVec_3(i_node_expr, mkIRExpr_HWord((HWord)ev2->inode), + mkIRExpr_HWord((HWord)ev3->inode)); + regparms = 3; + inew = i + 3; + } + /* Merge an Ir with one following Ir. */ + else if (ev2 && ev2->tag == Ev_Ir) { + helperName = TG_(cachesim).log_2I0D_name; + helperAddr = TG_(cachesim).log_2I0D; + argv = + mkIRExprVec_2(i_node_expr, mkIRExpr_HWord((HWord)ev2->inode)); regparms = 2; - inew = i+1; - break; - case Ev_G: - /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */ - helperName = "log_global_event"; - helperAddr = &log_global_event; - argv = mkIRExprVec_1( i_node_expr ); - regparms = 1; - inew = i+1; - break; - default: - tl_assert(0); + inew = i + 2; + } + /* No merging possible; emit as-is. */ + else { + helperName = TG_(cachesim).log_1I0D_name; + helperAddr = TG_(cachesim).log_1I0D; + argv = mkIRExprVec_1(i_node_expr); + regparms = 1; + inew = i + 1; + } + break; + case Ev_Dr: + /* Data read or modify */ + helperName = TG_(cachesim).log_0I1Dr_name; + helperAddr = TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev), + mkIRExpr_HWord(get_Event_dszB(ev))); + regparms = 3; + inew = i + 1; + break; + case Ev_Dw: + case Ev_Dm: + /* Data write */ + helperName = TG_(cachesim).log_0I1Dw_name; + helperAddr = TG_(cachesim).log_0I1Dw; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev), + mkIRExpr_HWord(get_Event_dszB(ev))); + regparms = 3; + inew = i + 1; + break; + case Ev_Bc: + /* Conditional branch */ + helperName = "log_cond_branch"; + helperAddr = &log_cond_branch; + argv = mkIRExprVec_2(i_node_expr, ev->Ev.Bc.taken); + regparms = 2; + inew = i + 1; + break; + case Ev_Bi: + /* Branch to an unknown destination */ + helperName = "log_ind_branch"; + helperAddr = &log_ind_branch; + argv = mkIRExprVec_2(i_node_expr, ev->Ev.Bi.dst); + regparms = 2; + inew = i + 1; + break; + case Ev_G: + /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */ + helperName = "log_global_event"; + helperAddr = &log_global_event; + argv = mkIRExprVec_1(i_node_expr); + regparms = 1; + inew = i + 1; + break; + default: + tl_assert(0); } - TG_DEBUGIF(5) { - if (inew > i+1) { - VG_(printf)(" merge "); - showEvent( ev2 ); - } - if (inew > i+2) { - VG_(printf)(" merge "); - showEvent( ev3 ); - } - if (helperAddr) - VG_(printf)(" call %s (%p)\n", - helperName, helperAddr); + TG_DEBUGIF(5) + { + if (inew > i + 1) { + VG_(printf)(" merge "); + showEvent(ev2); + } + if (inew > i + 2) { + VG_(printf)(" merge "); + showEvent(ev3); + } + if (helperAddr) + VG_(printf)(" call %s (%p)\n", helperName, helperAddr); } /* helper could be unset depending on the simulator used */ - if (helperAddr == 0) continue; + if (helperAddr == 0) + continue; /* Add the helper. 
*/ tl_assert(helperName); tl_assert(helperAddr); tl_assert(argv); - di = unsafeIRDirty_0_N( regparms, - helperName, VG_(fnptr_to_fnentry)( helperAddr ), - argv ); - addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); + di = unsafeIRDirty_0_N(regparms, helperName, + VG_(fnptr_to_fnentry)(helperAddr), argv); + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); } clgs->events_used = 0; } -static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode ) +static void addEvent_Ir(ClgState* clgs, InstrInfo* inode) { Event* evt; tl_assert(clgs->seen_before || (inode->eventset == 0)); - if (!TG_(clo).simulate_cache) return; + if (!TG_(clo).simulate_cache) + return; if (clgs->events_used == N_EVENTS) flushEvents(clgs); tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); evt = &clgs->events[clgs->events_used]; init_Event(evt); - evt->tag = Ev_Ir; - evt->inode = inode; + evt->tag = Ev_Ir; + evt->inode = inode; clgs->events_used++; } -static -void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) +static void +addEvent_Dr(ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea) { Event* evt; tl_assert(isIRAtom(ea)); tl_assert(datasize >= 1); - if (!TG_(clo).simulate_cache) return; + if (!TG_(clo).simulate_cache) + return; tl_assert(datasize <= TG_(min_line_size)); if (clgs->events_used == N_EVENTS) @@ -632,24 +626,22 @@ void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) clgs->events_used++; } -static -void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) +static void +addEvent_Dw(ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea) { Event* evt; tl_assert(isIRAtom(ea)); tl_assert(datasize >= 1); - if (!TG_(clo).simulate_cache) return; + if (!TG_(clo).simulate_cache) + return; tl_assert(datasize <= TG_(min_line_size)); /* Is it possible to merge this write with the preceding read? */ if (clgs->events_used > 0) { - Event* lastEvt = &clgs->events[clgs->events_used-1]; - if ( lastEvt->tag == Ev_Dr - && lastEvt->Ev.Dr.szB == datasize - && lastEvt->inode == inode - && eqIRAtom(lastEvt->Ev.Dr.ea, ea)) - { - lastEvt->tag = Ev_Dm; + Event* lastEvt = &clgs->events[clgs->events_used - 1]; + if (lastEvt->tag == Ev_Dr && lastEvt->Ev.Dr.szB == datasize && + lastEvt->inode == inode && eqIRAtom(lastEvt->Ev.Dr.ea, ea)) { + lastEvt->tag = Ev_Dm; return; } } @@ -667,16 +659,19 @@ void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) clgs->events_used++; } -static -void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode, - Int datasize, IRAtom* ea, IRAtom* guard, - Bool isWrite ) +static void addEvent_D_guarded(ClgState* clgs, + InstrInfo* inode, + Int datasize, + IRAtom* ea, + IRAtom* guard, + Bool isWrite) { tl_assert(isIRAtom(ea)); tl_assert(guard); tl_assert(isIRAtom(guard)); tl_assert(datasize >= 1); - if (!TG_(clo).simulate_cache) return; + if (!TG_(clo).simulate_cache) + return; tl_assert(datasize <= TG_(min_line_size)); /* Adding guarded memory actions and merging them with the existing @@ -695,30 +690,26 @@ void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode, IRExpr** argv; Int regparms; IRDirty* di; - i_node_expr = mkIRExpr_HWord( (HWord)inode ); - helperName = isWrite ? TG_(cachesim).log_0I1Dw_name - : TG_(cachesim).log_0I1Dr_name; - helperAddr = isWrite ? 
TG_(cachesim).log_0I1Dw - : TG_(cachesim).log_0I1Dr; - argv = mkIRExprVec_3( i_node_expr, - ea, mkIRExpr_HWord( datasize ) ); - regparms = 3; - di = unsafeIRDirty_0_N( - regparms, - helperName, VG_(fnptr_to_fnentry)( helperAddr ), - argv ); - di->guard = guard; - addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); + i_node_expr = mkIRExpr_HWord((HWord)inode); + helperName = + isWrite ? TG_(cachesim).log_0I1Dw_name : TG_(cachesim).log_0I1Dr_name; + helperAddr = isWrite ? TG_(cachesim).log_0I1Dw : TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3(i_node_expr, ea, mkIRExpr_HWord(datasize)); + regparms = 3; + di = unsafeIRDirty_0_N(regparms, helperName, + VG_(fnptr_to_fnentry)(helperAddr), argv); + di->guard = guard; + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); } -static -void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard ) +static void addEvent_Bc(ClgState* clgs, InstrInfo* inode, IRAtom* guard) { Event* evt; tl_assert(isIRAtom(guard)); - tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard) - == (sizeof(RegWord)==4 ? Ity_I32 : Ity_I64)); - if (!TG_(clo).simulate_branch) return; + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard) == + (sizeof(RegWord) == 4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) + return; if (clgs->events_used == N_EVENTS) flushEvents(clgs); @@ -731,14 +722,14 @@ void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard ) clgs->events_used++; } -static -void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo ) +static void addEvent_Bi(ClgState* clgs, InstrInfo* inode, IRAtom* whereTo) { Event* evt; tl_assert(isIRAtom(whereTo)); - tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo) - == (sizeof(RegWord)==4 ? Ity_I32 : Ity_I64)); - if (!TG_(clo).simulate_branch) return; + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo) == + (sizeof(RegWord) == 4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) + return; if (clgs->events_used == N_EVENTS) flushEvents(clgs); @@ -751,19 +742,19 @@ void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo ) clgs->events_used++; } -static -void addEvent_G ( ClgState* clgs, InstrInfo* inode ) +static void addEvent_G(ClgState* clgs, InstrInfo* inode) { Event* evt; - if (!TG_(clo).collect_bus) return; + if (!TG_(clo).collect_bus) + return; if (clgs->events_used == N_EVENTS) flushEvents(clgs); tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); evt = &clgs->events[clgs->events_used]; init_Event(evt); - evt->tag = Ev_G; - evt->inode = inode; + evt->tag = Ev_G; + evt->inode = inode; clgs->events_used++; } @@ -773,22 +764,20 @@ void addEvent_G ( ClgState* clgs, InstrInfo* inode ) instructions. The event set is extended as required on flush of the event queue (when Dm events were determined), cost offsets are determined at end of BB instrumentation. 
*/ -static -InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size ) +static InstrInfo* next_InstrInfo(ClgState* clgs, UInt instr_size) { InstrInfo* ii; tl_assert(clgs->ii_index < clgs->bb->instr_count); - ii = &clgs->bb->instr[ clgs->ii_index ]; + ii = &clgs->bb->instr[clgs->ii_index]; if (clgs->seen_before) { - TG_ASSERT(ii->instr_offset == clgs->instr_offset); - TG_ASSERT(ii->instr_size == instr_size); - } - else { - ii->instr_offset = clgs->instr_offset; - ii->instr_size = instr_size; - ii->cost_offset = 0; - ii->eventset = 0; + TG_ASSERT(ii->instr_offset == clgs->instr_offset); + TG_ASSERT(ii->instr_size == instr_size); + } else { + ii->instr_offset = clgs->instr_offset; + ii->instr_size = instr_size; + ii->cost_offset = 0; + ii->eventset = 0; } clgs->ii_index++; @@ -799,24 +788,23 @@ InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size ) } // return total number of cost values needed for this BB -static -UInt update_cost_offsets( ClgState* clgs ) +static UInt update_cost_offsets(ClgState* clgs) { - Int i; - InstrInfo* ii; - UInt cost_offset = 0; - - TG_ASSERT(clgs->bb->instr_count == clgs->ii_index); - for(i=0; iii_index; i++) { - ii = &clgs->bb->instr[i]; - if (clgs->seen_before) { - TG_ASSERT(ii->cost_offset == cost_offset); - } else - ii->cost_offset = cost_offset; - cost_offset += ii->eventset ? ii->eventset->size : 0; - } - - return cost_offset; + Int i; + InstrInfo* ii; + UInt cost_offset = 0; + + TG_ASSERT(clgs->bb->instr_count == clgs->ii_index); + for (i = 0; i < clgs->ii_index; i++) { + ii = &clgs->bb->instr[i]; + if (clgs->seen_before) { + TG_ASSERT(ii->cost_offset == cost_offset); + } else + ii->cost_offset = cost_offset; + cost_offset += ii->eventset ? ii->eventset->size : 0; + } + + return cost_offset; } /*------------------------------------------------------------*/ @@ -824,30 +812,27 @@ UInt update_cost_offsets( ClgState* clgs ) /*------------------------------------------------------------*/ #if defined(VG_BIGENDIAN) -# define CLGEndness Iend_BE +#define CLGEndness Iend_BE #elif defined(VG_LITTLEENDIAN) -# define CLGEndness Iend_LE +#define CLGEndness Iend_LE #else -# error "Unknown endianness" +#error "Unknown endianness" #endif -static -Addr IRConst2Addr(IRConst* con) +static Addr IRConst2Addr(IRConst* con) { - Addr addr; - - if (sizeof(RegWord) == 4) { - TG_ASSERT( con->tag == Ico_U32 ); - addr = con->Ico.U32; - } - else if (sizeof(RegWord) == 8) { - TG_ASSERT( con->tag == Ico_U64 ); - addr = con->Ico.U64; - } - else - VG_(tool_panic)("Tracegrind: invalid Addr type"); - - return addr; + Addr addr; + + if (sizeof(RegWord) == 4) { + TG_ASSERT(con->tag == Ico_U32); + addr = con->Ico.U32; + } else if (sizeof(RegWord) == 8) { + TG_ASSERT(con->tag == Ico_U64); + addr = con->Ico.U64; + } else + VG_(tool_panic)("Tracegrind: invalid Addr type"); + + return addr; } /* First pass over a BB to instrument, counting instructions and jumps @@ -855,62 +840,62 @@ Addr IRConst2Addr(IRConst* con) * * Called from TG_(get_bb) */ -void TG_(collectBlockInfo)(IRSB* sbIn, - /*INOUT*/ UInt* instrs, - /*INOUT*/ UInt* cjmps, - /*INOUT*/ Bool* cjmp_inverted) +void TG_(collectBlockInfo)(IRSB* sbIn, + /*INOUT*/ UInt* instrs, + /*INOUT*/ UInt* cjmps, + /*INOUT*/ Bool* cjmp_inverted) { - Int i; - IRStmt* st; - Addr instrAddr =0, jumpDst; - UInt instrLen = 0; - Bool toNextInstr = False; - - // Ist_Exit has to be ignored in preamble code, before first IMark: - // preamble code is added by VEX for self modifying code, and has - // nothing to do with client code - Bool inPreamble = True; 
- - if (!sbIn) return; - - for (i = 0; i < sbIn->stmts_used; i++) { - st = sbIn->stmts[i]; - if (Ist_IMark == st->tag) { - inPreamble = False; - - instrAddr = st->Ist.IMark.addr; - instrLen = st->Ist.IMark.len; - - (*instrs)++; - toNextInstr = False; - } - if (inPreamble) continue; - if (Ist_Exit == st->tag) { - jumpDst = IRConst2Addr(st->Ist.Exit.dst); - toNextInstr = (jumpDst == instrAddr + instrLen); - - (*cjmps)++; - } - } - - /* if the last instructions of BB conditionally jumps to next instruction - * (= first instruction of next BB in memory), this is a inverted by VEX. - */ - *cjmp_inverted = toNextInstr; + Int i; + IRStmt* st; + Addr instrAddr = 0, jumpDst; + UInt instrLen = 0; + Bool toNextInstr = False; + + // Ist_Exit has to be ignored in preamble code, before first IMark: + // preamble code is added by VEX for self modifying code, and has + // nothing to do with client code + Bool inPreamble = True; + + if (!sbIn) + return; + + for (i = 0; i < sbIn->stmts_used; i++) { + st = sbIn->stmts[i]; + if (Ist_IMark == st->tag) { + inPreamble = False; + + instrAddr = st->Ist.IMark.addr; + instrLen = st->Ist.IMark.len; + + (*instrs)++; + toNextInstr = False; + } + if (inPreamble) + continue; + if (Ist_Exit == st->tag) { + jumpDst = IRConst2Addr(st->Ist.Exit.dst); + toNextInstr = (jumpDst == instrAddr + instrLen); + + (*cjmps)++; + } + } + + /* if the last instructions of BB conditionally jumps to next instruction + * (= first instruction of next BB in memory), this is a inverted by VEX. + */ + *cjmp_inverted = toNextInstr; } -static -void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) +static void +addConstMemStoreStmt(IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) { - addStmtToIRSB( bbOut, - IRStmt_Store(CLGEndness, - IRExpr_Const(hWordTy == Ity_I32 ? - IRConst_U32( addr ) : - IRConst_U64( addr )), - IRExpr_Const(IRConst_U32(val)) )); + addStmtToIRSB( + bbOut, IRStmt_Store(CLGEndness, + IRExpr_Const(hWordTy == Ity_I32 ? 
IRConst_U32(addr) + : IRConst_U64(addr)), + IRExpr_Const(IRConst_U32(val)))); } - /* add helper call to setup_bbcc, with pointer to BB struct as argument * * precondition for setup_bbcc: @@ -931,28 +916,25 @@ void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) * set current_bbcc to BBCC that gets the costs for this BB execution * attached */ -static -void addBBSetupCall(ClgState* clgs) +static void addBBSetupCall(ClgState* clgs) { IRDirty* di; - IRExpr *arg1, **argv; + IRExpr * arg1, **argv; - arg1 = mkIRExpr_HWord( (HWord)clgs->bb ); + arg1 = mkIRExpr_HWord((HWord)clgs->bb); argv = mkIRExprVec_1(arg1); - di = unsafeIRDirty_0_N( 1, "setup_bbcc", - VG_(fnptr_to_fnentry)( & TG_(setup_bbcc) ), - argv); - addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); + di = unsafeIRDirty_0_N(1, "setup_bbcc", + VG_(fnptr_to_fnentry)(&TG_(setup_bbcc)), argv); + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); } - -static -IRSB* TG_(instrument)( VgCallbackClosure* closure, - IRSB* sbIn, - const VexGuestLayout* layout, - const VexGuestExtents* vge, - const VexArchInfo* archinfo_host, - IRType gWordTy, IRType hWordTy ) +static IRSB* TG_(instrument)(VgCallbackClosure* closure, + IRSB* sbIn, + const VexGuestLayout* layout, + const VexGuestExtents* vge, + const VexArchInfo* archinfo_host, + IRType gWordTy, + IRType hWordTy) { Int i; IRStmt* st; @@ -960,7 +942,7 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, InstrInfo* curr_inode = NULL; ClgState clgs; UInt cJumps = 0; - IRTypeEnv* tyenv = sbIn->tyenv; + IRTypeEnv* tyenv = sbIn->tyenv; if (gWordTy != hWordTy) { /* We don't currently support this case. */ @@ -968,10 +950,10 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, } // No instrumentation if it is switched off - if (! TG_(instrument_state)) { - TG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n", - (Addr)closure->readdr); - return sbIn; + if (!TG_(instrument_state)) { + TG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n", + (Addr)closure->readdr); + return sbIn; } TG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr); @@ -982,19 +964,19 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, // Copy verbatim any IR preamble preceding the first IMark i = 0; while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) { - addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] ); + addStmtToIRSB(clgs.sbOut, sbIn->stmts[i]); i++; } // Get the first statement, and origAddr from it - TG_ASSERT(sbIn->stmts_used >0); + TG_ASSERT(sbIn->stmts_used > 0); TG_ASSERT(i < sbIn->stmts_used); st = sbIn->stmts[i]; TG_ASSERT(Ist_IMark == st->tag); origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta; - TG_ASSERT(origAddr == st->Ist.IMark.addr - + st->Ist.IMark.delta); // XXX: check no overflow + TG_ASSERT(origAddr == st->Ist.IMark.addr + + st->Ist.IMark.delta); // XXX: check no overflow /* Get BB struct (creating if necessary). * JS: The hash table is keyed with orig_addr_noredir -- important! 
@@ -1006,8 +988,8 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, addBBSetupCall(&clgs); // Set up running state - clgs.events_used = 0; - clgs.ii_index = 0; + clgs.events_used = 0; + clgs.ii_index = 0; clgs.instr_offset = 0; for (/*use current i*/; i < sbIn->stmts_used; i++) { @@ -1016,252 +998,250 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, TG_ASSERT(isFlatIRStmt(st)); switch (st->tag) { - case Ist_NoOp: - case Ist_AbiHint: - case Ist_Put: - case Ist_PutI: - case Ist_MBE: - break; - - case Ist_IMark: { - Addr cia = st->Ist.IMark.addr + st->Ist.IMark.delta; - UInt isize = st->Ist.IMark.len; - TG_ASSERT(clgs.instr_offset == cia - origAddr); - // If Vex fails to decode an instruction, the size will be zero. - // Pretend otherwise. - if (isize == 0) isize = VG_MIN_INSTR_SZB; - - // Sanity-check size. - tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB) - || VG_CLREQ_SZB == isize ); - - // Init the inode, record it as the current one. - // Subsequent Dr/Dw/Dm events from the same instruction will - // also use it. - curr_inode = next_InstrInfo (&clgs, isize); - - addEvent_Ir( &clgs, curr_inode ); - break; - } - - case Ist_WrTmp: { - IRExpr* data = st->Ist.WrTmp.data; - if (data->tag == Iex_Load) { - IRExpr* aexpr = data->Iex.Load.addr; - // Note also, endianness info is ignored. I guess - // that's not interesting. - addEvent_Dr( &clgs, curr_inode, - sizeofIRType(data->Iex.Load.ty), aexpr ); - } - break; - } - - case Ist_Store: { - IRExpr* data = st->Ist.Store.data; - IRExpr* aexpr = st->Ist.Store.addr; - addEvent_Dw( &clgs, curr_inode, - sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr ); - break; - } - - case Ist_StoreG: { - IRStoreG* sg = st->Ist.StoreG.details; - IRExpr* data = sg->data; - IRExpr* addr = sg->addr; - IRType type = typeOfIRExpr(tyenv, data); - tl_assert(type != Ity_INVALID); - addEvent_D_guarded( &clgs, curr_inode, - sizeofIRType(type), addr, sg->guard, - True/*isWrite*/ ); - break; + case Ist_NoOp: + case Ist_AbiHint: + case Ist_Put: + case Ist_PutI: + case Ist_MBE: + break; + + case Ist_IMark: { + Addr cia = st->Ist.IMark.addr + st->Ist.IMark.delta; + UInt isize = st->Ist.IMark.len; + TG_ASSERT(clgs.instr_offset == cia - origAddr); + // If Vex fails to decode an instruction, the size will be zero. + // Pretend otherwise. + if (isize == 0) + isize = VG_MIN_INSTR_SZB; + + // Sanity-check size. + tl_assert((VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB) || + VG_CLREQ_SZB == isize); + + // Init the inode, record it as the current one. + // Subsequent Dr/Dw/Dm events from the same instruction will + // also use it. + curr_inode = next_InstrInfo(&clgs, isize); + + addEvent_Ir(&clgs, curr_inode); + break; + } + + case Ist_WrTmp: { + IRExpr* data = st->Ist.WrTmp.data; + if (data->tag == Iex_Load) { + IRExpr* aexpr = data->Iex.Load.addr; + // Note also, endianness info is ignored. I guess + // that's not interesting. 
+ addEvent_Dr(&clgs, curr_inode, sizeofIRType(data->Iex.Load.ty), + aexpr); } + break; + } - case Ist_LoadG: { - IRLoadG* lg = st->Ist.LoadG.details; - IRType type = Ity_INVALID; /* loaded type */ - IRType typeWide = Ity_INVALID; /* after implicit widening */ - IRExpr* addr = lg->addr; - typeOfIRLoadGOp(lg->cvt, &typeWide, &type); - tl_assert(type != Ity_INVALID); - addEvent_D_guarded( &clgs, curr_inode, - sizeofIRType(type), addr, lg->guard, - False/*!isWrite*/ ); - break; + case Ist_Store: { + IRExpr* data = st->Ist.Store.data; + IRExpr* aexpr = st->Ist.Store.addr; + addEvent_Dw(&clgs, curr_inode, + sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr); + break; + } + + case Ist_StoreG: { + IRStoreG* sg = st->Ist.StoreG.details; + IRExpr* data = sg->data; + IRExpr* addr = sg->addr; + IRType type = typeOfIRExpr(tyenv, data); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded(&clgs, curr_inode, sizeofIRType(type), addr, + sg->guard, True /*isWrite*/); + break; + } + + case Ist_LoadG: { + IRLoadG* lg = st->Ist.LoadG.details; + IRType type = Ity_INVALID; /* loaded type */ + IRType typeWide = Ity_INVALID; /* after implicit widening */ + IRExpr* addr = lg->addr; + typeOfIRLoadGOp(lg->cvt, &typeWide, &type); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded(&clgs, curr_inode, sizeofIRType(type), addr, + lg->guard, False /*!isWrite*/); + break; + } + + case Ist_Dirty: { + Int dataSize; + IRDirty* d = st->Ist.Dirty.details; + if (d->mFx != Ifx_None) { + /* This dirty helper accesses memory. Collect the details. */ + tl_assert(d->mAddr != NULL); + tl_assert(d->mSize != 0); + dataSize = d->mSize; + // Large (eg. 28B, 108B, 512B on x86) data-sized + // instructions will be done inaccurately, but they're + // very rare and this avoids errors from hitting more + // than two cache lines in the simulation. + if (TG_(clo).simulate_cache && dataSize > TG_(min_line_size)) + dataSize = TG_(min_line_size); + if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) + addEvent_Dr(&clgs, curr_inode, dataSize, d->mAddr); + if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) + addEvent_Dw(&clgs, curr_inode, dataSize, d->mAddr); + } else { + tl_assert(d->mAddr == NULL); + tl_assert(d->mSize == 0); + } + break; + } + + case Ist_CAS: { + /* We treat it as a read and a write of the location. I + think that is the same behaviour as it was before IRCAS + was introduced, since prior to that point, the Vex + front ends would translate a lock-prefixed instruction + into a (normal) read followed by a (normal) write. 
*/ + Int dataSize; + IRCAS* cas = st->Ist.CAS.details; + TG_ASSERT(cas->addr && isIRAtom(cas->addr)); + TG_ASSERT(cas->dataLo); + dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo)); + if (cas->dataHi != NULL) + dataSize *= 2; /* since this is a doubleword-cas */ + addEvent_Dr(&clgs, curr_inode, dataSize, cas->addr); + addEvent_Dw(&clgs, curr_inode, dataSize, cas->addr); + addEvent_G(&clgs, curr_inode); + break; + } + + case Ist_LLSC: { + IRType dataTy; + if (st->Ist.LLSC.storedata == NULL) { + /* LL */ + dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result); + addEvent_Dr(&clgs, curr_inode, sizeofIRType(dataTy), + st->Ist.LLSC.addr); + /* flush events before LL, should help SC to succeed */ + flushEvents(&clgs); + } else { + /* SC */ + dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata); + addEvent_Dw(&clgs, curr_inode, sizeofIRType(dataTy), + st->Ist.LLSC.addr); + /* I don't know whether the global-bus-lock cost should + be attributed to the LL or the SC, but it doesn't + really matter since they always have to be used in + pairs anyway. Hence put it (quite arbitrarily) on + the SC. */ + addEvent_G(&clgs, curr_inode); } + break; + } - case Ist_Dirty: { - Int dataSize; - IRDirty* d = st->Ist.Dirty.details; - if (d->mFx != Ifx_None) { - /* This dirty helper accesses memory. Collect the details. */ - tl_assert(d->mAddr != NULL); - tl_assert(d->mSize != 0); - dataSize = d->mSize; - // Large (eg. 28B, 108B, 512B on x86) data-sized - // instructions will be done inaccurately, but they're - // very rare and this avoids errors from hitting more - // than two cache lines in the simulation. - if (TG_(clo).simulate_cache && dataSize > TG_(min_line_size)) - dataSize = TG_(min_line_size); - if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) - addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr ); - if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) - addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr ); - } else { - tl_assert(d->mAddr == NULL); - tl_assert(d->mSize == 0); - } - break; - } - - case Ist_CAS: { - /* We treat it as a read and a write of the location. I - think that is the same behaviour as it was before IRCAS - was introduced, since prior to that point, the Vex - front ends would translate a lock-prefixed instruction - into a (normal) read followed by a (normal) write. */ - Int dataSize; - IRCAS* cas = st->Ist.CAS.details; - TG_ASSERT(cas->addr && isIRAtom(cas->addr)); - TG_ASSERT(cas->dataLo); - dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo)); - if (cas->dataHi != NULL) - dataSize *= 2; /* since this is a doubleword-cas */ - addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr ); - addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr ); - addEvent_G( &clgs, curr_inode ); - break; + case Ist_Exit: { + Bool guest_exit, inverted; + + /* VEX code generation sometimes inverts conditional branches. + * As Tracegrind counts (conditional) jumps, it has to correct + * inversions. The heuristic is the following: + * (1) Tracegrind switches off SB chasing and unrolling, and + * therefore it assumes that a candidate for inversion only is + * the last conditional branch in an SB. + * (2) inversion is assumed if the branch jumps to the address of + * the next guest instruction in memory. + * This heuristic is precalculated in TG_(collectBlockInfo)(). + * + * Branching behavior is also used for branch prediction. Note that + * above heuristic is different from what Cachegrind does. + * Cachegrind uses (2) for all branches. 
+ */ + if (cJumps + 1 == clgs.bb->cjmp_count) + inverted = clgs.bb->cjmp_inverted; + else + inverted = False; + + // call branch predictor only if this is a branch in guest code + guest_exit = (st->Ist.Exit.jk == Ijk_Boring) || + (st->Ist.Exit.jk == Ijk_Call) || + (st->Ist.Exit.jk == Ijk_Ret); + + if (guest_exit) { + /* Stuff to widen the guard expression to a host word, so + we can pass it to the branch predictor simulation + functions easily. */ + IRType tyW = hWordTy; + IROp widen = tyW == Ity_I32 ? Iop_1Uto32 : Iop_1Uto64; + IROp opXOR = tyW == Ity_I32 ? Iop_Xor32 : Iop_Xor64; + IRTemp guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1); + IRTemp guardW = newIRTemp(clgs.sbOut->tyenv, tyW); + IRTemp guard = newIRTemp(clgs.sbOut->tyenv, tyW); + IRExpr* one = tyW == Ity_I32 ? IRExpr_Const(IRConst_U32(1)) + : IRExpr_Const(IRConst_U64(1)); + + /* Widen the guard expression. */ + addStmtToIRSB(clgs.sbOut, IRStmt_WrTmp(guard1, st->Ist.Exit.guard)); + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp(guardW, IRExpr_Unop(widen, IRExpr_RdTmp(guard1)))); + /* If the exit is inverted, invert the sense of the guard. */ + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp(guard, + inverted + ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one) + : IRExpr_RdTmp(guardW))); + /* And post the event. */ + addEvent_Bc(&clgs, curr_inode, IRExpr_RdTmp(guard)); } - case Ist_LLSC: { - IRType dataTy; - if (st->Ist.LLSC.storedata == NULL) { - /* LL */ - dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result); - addEvent_Dr( &clgs, curr_inode, - sizeofIRType(dataTy), st->Ist.LLSC.addr ); - /* flush events before LL, should help SC to succeed */ - flushEvents( &clgs ); - } else { - /* SC */ - dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata); - addEvent_Dw( &clgs, curr_inode, - sizeofIRType(dataTy), st->Ist.LLSC.addr ); - /* I don't know whether the global-bus-lock cost should - be attributed to the LL or the SC, but it doesn't - really matter since they always have to be used in - pairs anyway. Hence put it (quite arbitrarily) on - the SC. */ - addEvent_G( &clgs, curr_inode ); + /* We may never reach the next statement, so need to flush + all outstanding transactions now. */ + flushEvents(&clgs); + + TG_ASSERT(clgs.ii_index > 0); + if (!clgs.seen_before) { + TgJumpKind jk; + + if (st->Ist.Exit.jk == Ijk_Call) + jk = jk_Call; + else if (st->Ist.Exit.jk == Ijk_Ret) + jk = jk_Return; + else { + if (IRConst2Addr(st->Ist.Exit.dst) == + origAddr + curr_inode->instr_offset + curr_inode->instr_size) + jk = jk_None; + else + jk = jk_Jump; } - break; + + clgs.bb->jmp[cJumps].instr = clgs.ii_index - 1; + clgs.bb->jmp[cJumps].jmpkind = jk; } - case Ist_Exit: { - Bool guest_exit, inverted; - - /* VEX code generation sometimes inverts conditional branches. - * As Tracegrind counts (conditional) jumps, it has to correct - * inversions. The heuristic is the following: - * (1) Tracegrind switches off SB chasing and unrolling, and - * therefore it assumes that a candidate for inversion only is - * the last conditional branch in an SB. - * (2) inversion is assumed if the branch jumps to the address of - * the next guest instruction in memory. - * This heuristic is precalculated in TG_(collectBlockInfo)(). - * - * Branching behavior is also used for branch prediction. Note that - * above heuristic is different from what Cachegrind does. - * Cachegrind uses (2) for all branches. 
- */ - if (cJumps+1 == clgs.bb->cjmp_count) - inverted = clgs.bb->cjmp_inverted; - else - inverted = False; - - // call branch predictor only if this is a branch in guest code - guest_exit = (st->Ist.Exit.jk == Ijk_Boring) || - (st->Ist.Exit.jk == Ijk_Call) || - (st->Ist.Exit.jk == Ijk_Ret); - - if (guest_exit) { - /* Stuff to widen the guard expression to a host word, so - we can pass it to the branch predictor simulation - functions easily. */ - IRType tyW = hWordTy; - IROp widen = tyW==Ity_I32 ? Iop_1Uto32 : Iop_1Uto64; - IROp opXOR = tyW==Ity_I32 ? Iop_Xor32 : Iop_Xor64; - IRTemp guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1); - IRTemp guardW = newIRTemp(clgs.sbOut->tyenv, tyW); - IRTemp guard = newIRTemp(clgs.sbOut->tyenv, tyW); - IRExpr* one = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1)) - : IRExpr_Const(IRConst_U64(1)); - - /* Widen the guard expression. */ - addStmtToIRSB( clgs.sbOut, - IRStmt_WrTmp( guard1, st->Ist.Exit.guard )); - addStmtToIRSB( clgs.sbOut, - IRStmt_WrTmp( guardW, - IRExpr_Unop(widen, - IRExpr_RdTmp(guard1))) ); - /* If the exit is inverted, invert the sense of the guard. */ - addStmtToIRSB( - clgs.sbOut, - IRStmt_WrTmp( - guard, - inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one) - : IRExpr_RdTmp(guardW) - )); - /* And post the event. */ - addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) ); - } + /* Update global variable jmps_passed before the jump + * A correction is needed if VEX inverted the last jump condition + */ + UInt val = inverted ? cJumps + 1 : cJumps; + addConstMemStoreStmt( + clgs.sbOut, (UWord)&TG_(current_state).jmps_passed, val, hWordTy); + cJumps++; - /* We may never reach the next statement, so need to flush - all outstanding transactions now. */ - flushEvents( &clgs ); - - TG_ASSERT(clgs.ii_index>0); - if (!clgs.seen_before) { - TgJumpKind jk; - - if (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call; - else if (st->Ist.Exit.jk == Ijk_Ret) jk = jk_Return; - else { - if (IRConst2Addr(st->Ist.Exit.dst) == - origAddr + curr_inode->instr_offset + curr_inode->instr_size) - jk = jk_None; - else - jk = jk_Jump; - } - - clgs.bb->jmp[cJumps].instr = clgs.ii_index-1; - clgs.bb->jmp[cJumps].jmpkind = jk; - } - - /* Update global variable jmps_passed before the jump - * A correction is needed if VEX inverted the last jump condition - */ - UInt val = inverted ? cJumps+1 : cJumps; - addConstMemStoreStmt( clgs.sbOut, - (UWord) &TG_(current_state).jmps_passed, - val, hWordTy); - cJumps++; - - break; - } - - default: - tl_assert(0); - break; + break; + } + + default: + tl_assert(0); + break; } /* Copy the original statement */ - addStmtToIRSB( clgs.sbOut, st ); + addStmtToIRSB(clgs.sbOut, st); - TG_DEBUGIF(5) { - VG_(printf)(" pass "); - ppIRStmt(st); - VG_(printf)("\n"); + TG_DEBUGIF(5) + { + VG_(printf)(" pass "); + ppIRStmt(st); + VG_(printf)("\n"); } } @@ -1269,93 +1249,97 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, which are function returns as we assume the return stack predictor never mispredicts. */ if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) { - if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); } + if (0) { + ppIRExpr(sbIn->next); + VG_(printf)("\n"); + } switch (sbIn->next->tag) { - case Iex_Const: - break; /* boring - branch to known address */ - case Iex_RdTmp: - /* looks like an indirect branch (branch to unknown) */ - addEvent_Bi( &clgs, curr_inode, sbIn->next ); - break; - default: - /* shouldn't happen - if the incoming IR is properly - flattened, should only have tmp and const cases to - consider. 
*/ - tl_assert(0); + case Iex_Const: + break; /* boring - branch to known address */ + case Iex_RdTmp: + /* looks like an indirect branch (branch to unknown) */ + addEvent_Bi(&clgs, curr_inode, sbIn->next); + break; + default: + /* shouldn't happen - if the incoming IR is properly + flattened, should only have tmp and const cases to + consider. */ + tl_assert(0); } } /* At the end of the bb. Flush outstandings. */ - flushEvents( &clgs ); + flushEvents(&clgs); /* Update global variable jmps_passed at end of SB. * As TG_(current_state).jmps_passed is reset to 0 in setup_bbcc, * this can be omitted if there is no conditional jump in this SB. * A correction is needed if VEX inverted the last jump condition */ - if (cJumps>0) { + if (cJumps > 0) { UInt jmps_passed = cJumps; - if (clgs.bb->cjmp_inverted) jmps_passed--; - addConstMemStoreStmt( clgs.sbOut, - (UWord) &TG_(current_state).jmps_passed, - jmps_passed, hWordTy); + if (clgs.bb->cjmp_inverted) + jmps_passed--; + addConstMemStoreStmt(clgs.sbOut, (UWord)&TG_(current_state).jmps_passed, + jmps_passed, hWordTy); } TG_ASSERT(clgs.bb->cjmp_count == cJumps); TG_ASSERT(clgs.bb->instr_count == clgs.ii_index); /* Info for final exit from BB */ { - TgJumpKind jk; - - if (sbIn->jumpkind == Ijk_Call) jk = jk_Call; - else if (sbIn->jumpkind == Ijk_Ret) jk = jk_Return; - else { - jk = jk_Jump; - if ((sbIn->next->tag == Iex_Const) && - (IRConst2Addr(sbIn->next->Iex.Const.con) == - origAddr + clgs.instr_offset)) - jk = jk_None; - } - clgs.bb->jmp[cJumps].jmpkind = jk; - /* Instruction index of the call/ret at BB end - * (it is wrong for fall-through, but does not matter) */ - clgs.bb->jmp[cJumps].instr = clgs.ii_index-1; + TgJumpKind jk; + + if (sbIn->jumpkind == Ijk_Call) + jk = jk_Call; + else if (sbIn->jumpkind == Ijk_Ret) + jk = jk_Return; + else { + jk = jk_Jump; + if ((sbIn->next->tag == Iex_Const) && + (IRConst2Addr(sbIn->next->Iex.Const.con) == + origAddr + clgs.instr_offset)) + jk = jk_None; + } + clgs.bb->jmp[cJumps].jmpkind = jk; + /* Instruction index of the call/ret at BB end + * (it is wrong for fall-through, but does not matter) */ + clgs.bb->jmp[cJumps].instr = clgs.ii_index - 1; } /* swap information of last exit with final exit if inverted */ if (clgs.bb->cjmp_inverted) { - TgJumpKind jk; - UInt instr; - - jk = clgs.bb->jmp[cJumps].jmpkind; - clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind; - clgs.bb->jmp[cJumps-1].jmpkind = jk; - instr = clgs.bb->jmp[cJumps].instr; - clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr; - clgs.bb->jmp[cJumps-1].instr = instr; + TgJumpKind jk; + UInt instr; + + jk = clgs.bb->jmp[cJumps].jmpkind; + clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps - 1].jmpkind; + clgs.bb->jmp[cJumps - 1].jmpkind = jk; + instr = clgs.bb->jmp[cJumps].instr; + clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps - 1].instr; + clgs.bb->jmp[cJumps - 1].instr = instr; } if (clgs.seen_before) { - TG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs)); - TG_ASSERT(clgs.bb->instr_len == clgs.instr_offset); - } - else { - clgs.bb->cost_count = update_cost_offsets(&clgs); - clgs.bb->instr_len = clgs.instr_offset; + TG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs)); + TG_ASSERT(clgs.bb->instr_len == clgs.instr_offset); + } else { + clgs.bb->cost_count = update_cost_offsets(&clgs); + clgs.bb->instr_len = clgs.instr_offset; } TG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n", - origAddr, clgs.bb->instr_len, - clgs.bb->cjmp_count, clgs.bb->cost_count); - if (cJumps>0) { - 
TG_DEBUG(3, " [ "); - for (i=0;ijmp[i].instr); - TG_DEBUG(3, "], last inverted: %s \n", - clgs.bb->cjmp_inverted ? "yes":"no"); + origAddr, clgs.bb->instr_len, clgs.bb->cjmp_count, + clgs.bb->cost_count); + if (cJumps > 0) { + TG_DEBUG(3, " [ "); + for (i = 0; i < cJumps; i++) + TG_DEBUG(3, "%u ", clgs.bb->jmp[i].instr); + TG_DEBUG(3, "], last inverted: %s \n", + clgs.bb->cjmp_inverted ? "yes" : "no"); } - return clgs.sbOut; + return clgs.sbOut; } /*--------------------------------------------------------------------*/ @@ -1365,15 +1349,13 @@ IRSB* TG_(instrument)( VgCallbackClosure* closure, // Called when a translation is removed from the translation cache for // any reason at all: to free up space, because the guest code was // unmapped or modified, or for any arbitrary reason. -static -void tg_discard_superblock_info ( Addr orig_addr, VexGuestExtents vge ) +static void tg_discard_superblock_info(Addr orig_addr, VexGuestExtents vge) { - tl_assert(vge.n_used > 0); + tl_assert(vge.n_used > 0); if (0) - VG_(printf)( "discard_superblock_info: %p, %p, %llu\n", - (void*)orig_addr, - (void*)vge.base[0], (ULong)vge.len[0]); + VG_(printf)("discard_superblock_info: %p, %p, %llu\n", (void*)orig_addr, + (void*)vge.base[0], (ULong)vge.len[0]); // Get BB info, remove from table, free BB info. Simple! // When created, the BB is keyed by the first instruction address, @@ -1382,203 +1364,201 @@ void tg_discard_superblock_info ( Addr orig_addr, VexGuestExtents vge ) TG_(delete_bb)(vge.base[0]); } - /*------------------------------------------------------------*/ /*--- TG_(fini)() and related function ---*/ /*------------------------------------------------------------*/ - - -static -void unwind_thread(thread_info* t) +static void unwind_thread(thread_info* t) { - /* unwind signal handlers */ - while(TG_(current_state).sig !=0) - TG_(post_signal)(TG_(current_tid),TG_(current_state).sig); + /* unwind signal handlers */ + while (TG_(current_state).sig != 0) + TG_(post_signal)(TG_(current_tid), TG_(current_state).sig); - /* unwind regular call stack */ - while(TG_(current_call_stack).sp>0) - TG_(pop_call_stack)(); + /* unwind regular call stack */ + while (TG_(current_call_stack).sp > 0) + TG_(pop_call_stack)(); - /* reset context and function stack for context generation */ - TG_(init_exec_state)( &TG_(current_state) ); - TG_(current_fn_stack).top = TG_(current_fn_stack).bottom; + /* reset context and function stack for context generation */ + TG_(init_exec_state)(&TG_(current_state)); + TG_(current_fn_stack).top = TG_(current_fn_stack).bottom; } -static -void zero_state_cost(thread_info* t) +static void zero_state_cost(thread_info* t) { - TG_(zero_cost)( TG_(sets).full, TG_(current_state).cost ); + TG_(zero_cost)(TG_(sets).full, TG_(current_state).cost); } void TG_(set_instrument_state)(const HChar* reason, Bool state) { - if (TG_(instrument_state) == state) { - TG_DEBUG(2, "%s: instrumentation already %s\n", - reason, state ? "ON" : "OFF"); - return; - } - TG_(instrument_state) = state; - TG_DEBUG(2, "%s: Switching instrumentation %s ...\n", - reason, state ? "ON" : "OFF"); - - VG_(discard_translations_safely)( (Addr)0x1000, ~(SizeT)0xfff, "tracegrind"); - - /* reset internal state: call stacks, simulator */ - TG_(forall_threads)(unwind_thread); - TG_(forall_threads)(zero_state_cost); - (*TG_(cachesim).clear)(); - - if (VG_(clo_verbosity) > 1) - VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n", - reason, state ? 
"ON" : "OFF"); + if (TG_(instrument_state) == state) { + TG_DEBUG(2, "%s: instrumentation already %s\n", reason, + state ? "ON" : "OFF"); + return; + } + TG_(instrument_state) = state; + TG_DEBUG(2, "%s: Switching instrumentation %s ...\n", reason, + state ? "ON" : "OFF"); + + VG_(discard_translations_safely)((Addr)0x1000, ~(SizeT)0xfff, "tracegrind"); + + /* reset internal state: call stacks, simulator */ + TG_(forall_threads)(unwind_thread); + TG_(forall_threads)(zero_state_cost); + (*TG_(cachesim).clear)(); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n", reason, + state ? "ON" : "OFF"); } /* helper for dump_state_togdb */ static void dump_state_of_thread_togdb(thread_info* ti) { - static FullCost sum = 0, tmp = 0; - Int t, i; - BBCC *from, *to; - call_entry* ce; - HChar *mcost; - - t = TG_(current_tid); - TG_(init_cost_lz)( TG_(sets).full, &sum ); - TG_(copy_cost_lz)( TG_(sets).full, &tmp, ti->lastdump_cost ); - TG_(add_diff_cost)( TG_(sets).full, sum, ti->lastdump_cost, - ti->states.entry[0]->cost); - TG_(copy_cost)( TG_(sets).full, ti->lastdump_cost, tmp ); - mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); - VG_(gdb_printf)("events-%d: %s\n", t, mcost); - VG_(free)(mcost); - VG_(gdb_printf)("frames-%d: %d\n", t, TG_(current_call_stack).sp); - - ce = 0; - for(i = 0; i < TG_(current_call_stack).sp; i++) { + static FullCost sum = 0, tmp = 0; + Int t, i; + BBCC * from, *to; + call_entry* ce; + HChar* mcost; + + t = TG_(current_tid); + TG_(init_cost_lz)(TG_(sets).full, &sum); + TG_(copy_cost_lz)(TG_(sets).full, &tmp, ti->lastdump_cost); + TG_(add_diff_cost) + (TG_(sets).full, sum, ti->lastdump_cost, ti->states.entry[0]->cost); + TG_(copy_cost)(TG_(sets).full, ti->lastdump_cost, tmp); + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d: %s\n", t, mcost); + VG_(free)(mcost); + VG_(gdb_printf)("frames-%d: %d\n", t, TG_(current_call_stack).sp); + + ce = 0; + for (i = 0; i < TG_(current_call_stack).sp; i++) { ce = TG_(get_call_entry)(i); /* if this frame is skipped, we don't have counters */ - if (!ce->jcc) continue; + if (!ce->jcc) + continue; from = ce->jcc->from; - VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name); - VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter); + VG_(gdb_printf)("function-%d-%d: %s\n", t, i, from->cxt->fn[0]->name); + VG_(gdb_printf)("calls-%d-%d: %llu\n", t, i, ce->jcc->call_counter); /* FIXME: EventSets! 
*/ - TG_(copy_cost)( TG_(sets).full, sum, ce->jcc->cost ); - TG_(copy_cost)( TG_(sets).full, tmp, ce->enter_cost ); - TG_(add_diff_cost)( TG_(sets).full, sum, - ce->enter_cost, TG_(current_state).cost ); - TG_(copy_cost)( TG_(sets).full, ce->enter_cost, tmp ); + TG_(copy_cost)(TG_(sets).full, sum, ce->jcc->cost); + TG_(copy_cost)(TG_(sets).full, tmp, ce->enter_cost); + TG_(add_diff_cost) + (TG_(sets).full, sum, ce->enter_cost, TG_(current_state).cost); + TG_(copy_cost)(TG_(sets).full, ce->enter_cost, tmp); mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); - VG_(gdb_printf)("events-%d-%d: %s\n",t, i, mcost); + VG_(gdb_printf)("events-%d-%d: %s\n", t, i, mcost); VG_(free)(mcost); - } - if (ce && ce->jcc) { + } + if (ce && ce->jcc) { to = ce->jcc->to; - VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name ); - } + VG_(gdb_printf)("function-%d-%d: %s\n", t, i, to->cxt->fn[0]->name); + } } /* Dump current state */ static void dump_state_togdb(void) { - thread_info** th; - int t; - Int orig_tid = TG_(current_tid); - - VG_(gdb_printf)("instrumentation: %s\n", - TG_(instrument_state) ? "on":"off"); - if (!TG_(instrument_state)) return; - - VG_(gdb_printf)("executed-bbs: %llu\n", TG_(stat).bb_executions); - VG_(gdb_printf)("executed-calls: %llu\n", TG_(stat).call_counter); - VG_(gdb_printf)("distinct-bbs: %d\n", TG_(stat).distinct_bbs); - VG_(gdb_printf)("distinct-calls: %d\n", TG_(stat).distinct_jccs); - VG_(gdb_printf)("distinct-functions: %d\n", TG_(stat).distinct_fns); - VG_(gdb_printf)("distinct-contexts: %d\n", TG_(stat).distinct_contexts); - - /* "events:" line. Given here because it will be dynamic in the future */ - HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); - VG_(gdb_printf)("events: %s\n", evmap); - VG_(free)(evmap); - /* Total cost summary */ - - /* threads */ - th = TG_(get_threads)(); - VG_(gdb_printf)("threads:"); - for(t=1;ttv_sec = ms_timer / 1000; + switch (TG_(clo).collect_systime) { + default: + tl_assert(0); + case systime_msec: { + UInt ms_timer = VG_(read_millisecond_timer)(); + systime->tv_sec = ms_timer / 1000; systime->tv_nsec = (ms_timer % 1000) * 1000000L; break; - } - case systime_usec: { + } + case systime_usec: { struct vki_timeval tv_now; VG_(gettimeofday)(&tv_now, NULL); - systime->tv_sec = tv_now.tv_sec; + systime->tv_sec = tv_now.tv_sec; systime->tv_nsec = tv_now.tv_usec * 1000; break; - } + } case systime_nsec: -# if defined(VGO_linux) || defined(VGO_solaris) || defined(VGO_freebsd) +#if defined(VGO_linux) || defined(VGO_solaris) || defined(VGO_freebsd) VG_(clock_gettime)(systime, VKI_CLOCK_MONOTONIC); VG_(clock_gettime)(syscputime, VKI_CLOCK_THREAD_CPUTIME_ID); -# elif defined(VGO_darwin) +#elif defined(VGO_darwin) tl_assert(0); -# else -# error "Unknown OS" -# endif +#else +#error "Unknown OS" +#endif break; - } + } } -static -void TG_(pre_syscall)(ThreadId tid, UInt syscallno, - UWord* args, UInt nArgs) +static void +TG_(pre_syscall)(ThreadId tid, UInt syscallno, UWord* args, UInt nArgs) { - /* Collect time for systime tracking if enabled */ - if (TG_(clo).collect_systime != systime_no) { - collect_time(&syscalltime[tid], - TG_(clo).collect_systime == systime_nsec ? &syscallcputime[tid] : NULL); - } + /* Collect time for systime tracking if enabled */ + if (TG_(clo).collect_systime != systime_no) { + collect_time(&syscalltime[tid], TG_(clo).collect_systime == systime_nsec + ? &syscallcputime[tid] + : NULL); + } } /* Returns "after - before" in the unit as specified by --collect-systime. 
- after is supposed to be >= before, and tv_nsec must be >= 0 and < One_Second_In_Nsec. */ -static -ULong vki_timespec_diff (struct vki_timespec after, struct vki_timespec before) + after is supposed to be >= before, and tv_nsec must be >= 0 and < + One_Second_In_Nsec. */ +static ULong vki_timespec_diff(struct vki_timespec after, + struct vki_timespec before) { - vki_time_t diff_sec = after.tv_sec - before.tv_sec; - long diff_nsec = after.tv_nsec - before.tv_nsec; - ULong nsec_factor; // factor to convert the desired unit into nsec. + vki_time_t diff_sec = after.tv_sec - before.tv_sec; + long diff_nsec = after.tv_nsec - before.tv_nsec; + ULong nsec_factor; // factor to convert the desired unit into nsec. if (diff_nsec < 0) { diff_sec--; diff_nsec += 1000000000ULL; } - switch (TG_(clo).collect_systime) { - case systime_no: tl_assert (0); - case systime_msec: nsec_factor = 1000000ULL; break; - case systime_usec: nsec_factor = 1000ULL; break; - case systime_nsec: nsec_factor = 1ULL; break; - default: tl_assert(0); - } - return ((ULong) diff_sec * 1000000000ULL + diff_nsec) / nsec_factor; + switch (TG_(clo).collect_systime) { + case systime_no: + tl_assert(0); + case systime_msec: + nsec_factor = 1000000ULL; + break; + case systime_usec: + nsec_factor = 1000ULL; + break; + case systime_nsec: + nsec_factor = 1ULL; + break; + default: + tl_assert(0); + } + return ((ULong)diff_sec * 1000000000ULL + diff_nsec) / nsec_factor; } /* Check if syscall is a fork-like call that creates a new process */ static Bool is_fork_syscall(UInt syscallno) { #if defined(VGO_linux) - return syscallno == __NR_clone - || syscallno == __NR_fork - || syscallno == __NR_vfork -# if defined(__NR_clone3) - || syscallno == __NR_clone3 -# endif - ; + return syscallno == __NR_clone || syscallno == __NR_fork || + syscallno == __NR_vfork +#if defined(__NR_clone3) + || syscallno == __NR_clone3 +#endif + ; #else - return False; /* TODO: support other OSes */ + return False; /* TODO: support other OSes */ #endif } -static -void TG_(post_syscall)(ThreadId tid, UInt syscallno, - UWord* args, UInt nArgs, SysRes res) +static void TG_(post_syscall)( + ThreadId tid, UInt syscallno, UWord* args, UInt nArgs, SysRes res) { - /* Handle fork/clone: emit FORK event with child PID. - Skip if this was a thread-creating clone (CLONE_THREAD), - since we emit THREAD_CREATE via track_pre_thread_ll_create instead. */ - if (is_fork_syscall(syscallno) && !sr_isError(res) && sr_Res(res) > 0) { - Bool is_thread = False; + /* Handle fork/clone: emit FORK event with child PID. + Skip if this was a thread-creating clone (CLONE_THREAD), + since we emit THREAD_CREATE via track_pre_thread_ll_create instead. */ + if (is_fork_syscall(syscallno) && !sr_isError(res) && sr_Res(res) > 0) { + Bool is_thread = False; #if defined(VGO_linux) - if (syscallno == __NR_clone && nArgs > 0) - is_thread = (args[0] & VKI_CLONE_THREAD) != 0; -# if defined(__NR_clone3) - if (syscallno == __NR_clone3 && nArgs > 0) { - /* clone3 first arg is pointer to struct clone_args; - flags is the first field (ULong / __u64). */ - ULong flags = *(ULong*)(Addr)args[0]; - is_thread = (flags & VKI_CLONE_THREAD) != 0; - } -# endif + if (syscallno == __NR_clone && nArgs > 0) + is_thread = (args[0] & VKI_CLONE_THREAD) != 0; +#if defined(__NR_clone3) + if (syscallno == __NR_clone3 && nArgs > 0) { + /* clone3 first arg is pointer to struct clone_args; + flags is the first field (ULong / __u64). 
*/ + ULong flags = *(ULong*)(Addr)args[0]; + is_thread = (flags & VKI_CLONE_THREAD) != 0; + } +#endif #endif - if (!is_thread) { - Int child_pid = (Int)sr_Res(res); - TG_(trace_emit_fork)(tid, child_pid); - } - } - - /* Handle systime collection if enabled */ - if (TG_(clo).collect_systime != systime_no && TG_(current_state).bbcc) { - Int o; - struct vki_timespec ts_now; - struct vki_timespec ts_cpunow; - ULong diff; - - collect_time(&ts_now, - TG_(clo).collect_systime == systime_nsec ? &ts_cpunow : NULL); - - diff = vki_timespec_diff (ts_now, syscalltime[tid]); - - /* offset o is for "SysCount", o+1 for "SysTime", - o+2 is (optionally) "SysCpuTime". */ - o = fullOffset(EG_SYS); - TG_ASSERT(o>=0); - TG_DEBUG(0," Time (Off %d) for Syscall %u: %llu\n", o, syscallno, - diff); - - if (!TG_(current_state).bbcc->skipped) - TG_(init_cost_lz)(TG_(sets).full, - &(TG_(current_state).bbcc->skipped)); - TG_(current_state).cost[o] ++; - TG_(current_state).cost[o+1] += diff; - TG_(current_state).bbcc->skipped[o] ++; - TG_(current_state).bbcc->skipped[o+1] += diff; - if (TG_(clo).collect_systime == systime_nsec) { - diff = vki_timespec_diff (ts_cpunow, syscallcputime[tid]); - TG_DEBUG(0," SysCpuTime (Off %d) for Syscall %u: %llu\n", o+2, syscallno, - diff); - TG_(current_state).cost[o+2] += diff; - TG_(current_state).bbcc->skipped[o+2] += diff; - } - } + if (!is_thread) { + Int child_pid = (Int)sr_Res(res); + TG_(trace_emit_fork)(tid, child_pid); + } + } + + /* Handle systime collection if enabled */ + if (TG_(clo).collect_systime != systime_no && TG_(current_state).bbcc) { + Int o; + struct vki_timespec ts_now; + struct vki_timespec ts_cpunow; + ULong diff; + + collect_time( + &ts_now, TG_(clo).collect_systime == systime_nsec ? &ts_cpunow : NULL); + + diff = vki_timespec_diff(ts_now, syscalltime[tid]); + + /* offset o is for "SysCount", o+1 for "SysTime", + o+2 is (optionally) "SysCpuTime". 
*/ + o = fullOffset(EG_SYS); + TG_ASSERT(o >= 0); + TG_DEBUG(0, " Time (Off %d) for Syscall %u: %llu\n", o, syscallno, + diff); + + if (!TG_(current_state).bbcc->skipped) + TG_(init_cost_lz)(TG_(sets).full, &(TG_(current_state).bbcc->skipped)); + TG_(current_state).cost[o]++; + TG_(current_state).cost[o + 1] += diff; + TG_(current_state).bbcc->skipped[o]++; + TG_(current_state).bbcc->skipped[o + 1] += diff; + if (TG_(clo).collect_systime == systime_nsec) { + diff = vki_timespec_diff(ts_cpunow, syscallcputime[tid]); + TG_DEBUG(0, " SysCpuTime (Off %d) for Syscall %u: %llu\n", o + 2, + syscallno, diff); + TG_(current_state).cost[o + 2] += diff; + TG_(current_state).bbcc->skipped[o + 2] += diff; + } + } } static UInt ULong_width(ULong n) @@ -1808,195 +1790,180 @@ static UInt ULong_width(ULong n) n = n / 10; w++; } - if (w == 0) w = 1; - return w + (w-1)/3; // add space for commas + if (w == 0) + w = 1; + return w + (w - 1) / 3; // add space for commas } -static -void branchsim_printstat(int l1, int l2, int l3) +static void branchsim_printstat(int l1, int l2, int l3) { - static HChar fmt[128]; // large enough - FullCost total; - ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp; - ULong B_total_b, B_total_mp; - - total = TG_(total_cost); - Bc_total_b = total[ fullOffset(EG_BC) ]; - Bc_total_mp = total[ fullOffset(EG_BC)+1 ]; - Bi_total_b = total[ fullOffset(EG_BI) ]; - Bi_total_mp = total[ fullOffset(EG_BI)+1 ]; - - /* Make format string, getting width right for numbers */ - VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n", - l1, l2, l3); - - if (0 == Bc_total_b) Bc_total_b = 1; - if (0 == Bi_total_b) Bi_total_b = 1; - B_total_b = Bc_total_b + Bi_total_b; - B_total_mp = Bc_total_mp + Bi_total_mp; - - VG_(umsg)("\n"); - VG_(umsg)(fmt, "Branches: ", - B_total_b, Bc_total_b, Bi_total_b); - - VG_(umsg)(fmt, "Mispredicts: ", - B_total_mp, Bc_total_mp, Bi_total_mp); - - VG_(umsg)("Mispred rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", - l1, B_total_mp * 100.0 / B_total_b, - l2, Bc_total_mp * 100.0 / Bc_total_b, - l3, Bi_total_mp * 100.0 / Bi_total_b); + static HChar fmt[128]; // large enough + FullCost total; + ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp; + ULong B_total_b, B_total_mp; + + total = TG_(total_cost); + Bc_total_b = total[fullOffset(EG_BC)]; + Bc_total_mp = total[fullOffset(EG_BC) + 1]; + Bi_total_b = total[fullOffset(EG_BI)]; + Bi_total_mp = total[fullOffset(EG_BI) + 1]; + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n", l1, l2, + l3); + + if (0 == Bc_total_b) + Bc_total_b = 1; + if (0 == Bi_total_b) + Bi_total_b = 1; + B_total_b = Bc_total_b + Bi_total_b; + B_total_mp = Bc_total_mp + Bi_total_mp; + + VG_(umsg)("\n"); + VG_(umsg)(fmt, "Branches: ", B_total_b, Bc_total_b, Bi_total_b); + + VG_(umsg)(fmt, "Mispredicts: ", B_total_mp, Bc_total_mp, Bi_total_mp); + + VG_(umsg)("Mispred rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + B_total_mp * 100.0 / B_total_b, l2, + Bc_total_mp * 100.0 / Bc_total_b, l3, + Bi_total_mp * 100.0 / Bi_total_b); } -static -void tg_print_stats(void) +static void tg_print_stats(void) { - int BB_lookups = - TG_(stat).full_debug_BBs + - TG_(stat).fn_name_debug_BBs + - TG_(stat).file_line_debug_BBs + - TG_(stat).no_debug_BBs; + int BB_lookups = TG_(stat).full_debug_BBs + TG_(stat).fn_name_debug_BBs + + TG_(stat).file_line_debug_BBs + TG_(stat).no_debug_BBs; /* Hash table stats */ - VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n", - TG_(stat).distinct_objs); + 
VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n", TG_(stat).distinct_objs); VG_(message)(Vg_DebugMsg, "Distinct files: %d\n", - TG_(stat).distinct_files); - VG_(message)(Vg_DebugMsg, "Distinct fns: %d\n", - TG_(stat).distinct_fns); + TG_(stat).distinct_files); + VG_(message)(Vg_DebugMsg, "Distinct fns: %d\n", TG_(stat).distinct_fns); VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n", - TG_(stat).distinct_contexts); - VG_(message)(Vg_DebugMsg, "Distinct BBs: %d\n", - TG_(stat).distinct_bbs); + TG_(stat).distinct_contexts); + VG_(message)(Vg_DebugMsg, "Distinct BBs: %d\n", TG_(stat).distinct_bbs); VG_(message)(Vg_DebugMsg, "Cost entries: %u (Chunks %u)\n", - TG_(costarray_entries), TG_(costarray_chunks)); + TG_(costarray_entries), TG_(costarray_chunks)); VG_(message)(Vg_DebugMsg, "Distinct BBCCs: %d\n", - TG_(stat).distinct_bbccs); - VG_(message)(Vg_DebugMsg, "Distinct JCCs: %d\n", - TG_(stat).distinct_jccs); + TG_(stat).distinct_bbccs); + VG_(message)(Vg_DebugMsg, "Distinct JCCs: %d\n", TG_(stat).distinct_jccs); VG_(message)(Vg_DebugMsg, "Distinct skips: %d\n", - TG_(stat).distinct_skips); - VG_(message)(Vg_DebugMsg, "BB lookups: %d\n", - BB_lookups); - if (BB_lookups>0) { + TG_(stat).distinct_skips); + VG_(message)(Vg_DebugMsg, "BB lookups: %d\n", BB_lookups); + if (BB_lookups > 0) { VG_(message)(Vg_DebugMsg, "With full debug info:%3d%% (%d)\n", - TG_(stat).full_debug_BBs * 100 / BB_lookups, - TG_(stat).full_debug_BBs); + TG_(stat).full_debug_BBs * 100 / BB_lookups, + TG_(stat).full_debug_BBs); VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n", - TG_(stat).file_line_debug_BBs * 100 / BB_lookups, - TG_(stat).file_line_debug_BBs); + TG_(stat).file_line_debug_BBs * 100 / BB_lookups, + TG_(stat).file_line_debug_BBs); VG_(message)(Vg_DebugMsg, "With fn name debug info:%3d%% (%d)\n", - TG_(stat).fn_name_debug_BBs * 100 / BB_lookups, - TG_(stat).fn_name_debug_BBs); + TG_(stat).fn_name_debug_BBs * 100 / BB_lookups, + TG_(stat).fn_name_debug_BBs); VG_(message)(Vg_DebugMsg, "With no debug info:%3d%% (%d)\n", - TG_(stat).no_debug_BBs * 100 / BB_lookups, - TG_(stat).no_debug_BBs); + TG_(stat).no_debug_BBs * 100 / BB_lookups, + TG_(stat).no_debug_BBs); } - VG_(message)(Vg_DebugMsg, "BBCC Clones: %d\n", - TG_(stat).bbcc_clones); + VG_(message)(Vg_DebugMsg, "BBCC Clones: %d\n", TG_(stat).bbcc_clones); VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d\n", - TG_(stat).bb_retranslations); + TG_(stat).bb_retranslations); VG_(message)(Vg_DebugMsg, "Distinct instrs: %d\n", - TG_(stat).distinct_instrs); + TG_(stat).distinct_instrs); VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n", - TG_(stat).cxt_lru_misses); + TG_(stat).cxt_lru_misses); VG_(message)(Vg_DebugMsg, "LRU BBCC Misses: %d\n", - TG_(stat).bbcc_lru_misses); + TG_(stat).bbcc_lru_misses); VG_(message)(Vg_DebugMsg, "LRU JCC Misses: %d\n", - TG_(stat).jcc_lru_misses); + TG_(stat).jcc_lru_misses); VG_(message)(Vg_DebugMsg, "BBs Executed: %llu\n", - TG_(stat).bb_executions); + TG_(stat).bb_executions); VG_(message)(Vg_DebugMsg, "Calls: %llu\n", - TG_(stat).call_counter); + TG_(stat).call_counter); VG_(message)(Vg_DebugMsg, "CondJMP followed: %llu\n", - TG_(stat).jcnd_counter); + TG_(stat).jcnd_counter); VG_(message)(Vg_DebugMsg, "Boring JMPs: %llu\n", - TG_(stat).jump_counter); + TG_(stat).jump_counter); VG_(message)(Vg_DebugMsg, "Recursive calls: %llu\n", - TG_(stat).rec_call_counter); + TG_(stat).rec_call_counter); VG_(message)(Vg_DebugMsg, "Returns: %llu\n", - TG_(stat).ret_counter); + TG_(stat).ret_counter); } - -static -void 
finish(void) +static void finish(void) { - HChar fmt[128]; // large enough - Int l1, l2, l3; - FullCost total; - - TG_DEBUG(0, "finish()\n"); - - (*TG_(cachesim).finish)(); - - /* pop all remaining items from CallStack for correct sum - */ - TG_(forall_threads)(unwind_thread); - - TG_(compute_total_cost)(); - - /* Close CSV trace output */ - TG_(trace_close_output)(); - - if (VG_(clo_verbosity) == 0) return; - - if (VG_(clo_stats)) { - VG_(message)(Vg_DebugMsg, "\n"); - tg_print_stats(); - VG_(message)(Vg_DebugMsg, "\n"); - } - - HChar *evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); - VG_(message)(Vg_UserMsg, "Events : %s\n", evmap); - VG_(free)(evmap); - HChar *mcost = TG_(mappingcost_as_string)(TG_(dumpmap), TG_(total_cost)); - VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost); - VG_(free)(mcost); - VG_(message)(Vg_UserMsg, "\n"); - - /* determine value widths for statistics */ - total = TG_(total_cost); - l1 = ULong_width( total[fullOffset(EG_IR)] ); - l2 = l3 = 0; - if (TG_(clo).simulate_cache) { - l2 = ULong_width( total[fullOffset(EG_DR)] ); - l3 = ULong_width( total[fullOffset(EG_DW)] ); - } - if (TG_(clo).simulate_branch) { - int l2b = ULong_width( total[fullOffset(EG_BC)] ); - int l3b = ULong_width( total[fullOffset(EG_BI)] ); - if (l2b > l2) l2 = l2b; - if (l3b > l3) l3 = l3b; - } - - /* Make format string, getting width right for numbers */ - VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1); - - /* Always print this */ - VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)] ); - - if (TG_(clo).simulate_cache) - (*TG_(cachesim).printstat)(l1, l2, l3); + HChar fmt[128]; // large enough + Int l1, l2, l3; + FullCost total; - if (TG_(clo).simulate_branch) - branchsim_printstat(l1, l2, l3); + TG_DEBUG(0, "finish()\n"); -} + (*TG_(cachesim).finish)(); + + /* pop all remaining items from CallStack for correct sum + */ + TG_(forall_threads)(unwind_thread); + TG_(compute_total_cost)(); -void TG_(fini)(Int exitcode) -{ - finish(); + /* Close CSV trace output */ + TG_(trace_close_output)(); + + if (VG_(clo_verbosity) == 0) + return; + + if (VG_(clo_stats)) { + VG_(message)(Vg_DebugMsg, "\n"); + tg_print_stats(); + VG_(message)(Vg_DebugMsg, "\n"); + } + + HChar* evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(message)(Vg_UserMsg, "Events : %s\n", evmap); + VG_(free)(evmap); + HChar* mcost = TG_(mappingcost_as_string)(TG_(dumpmap), TG_(total_cost)); + VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost); + VG_(free)(mcost); + VG_(message)(Vg_UserMsg, "\n"); + + /* determine value widths for statistics */ + total = TG_(total_cost); + l1 = ULong_width(total[fullOffset(EG_IR)]); + l2 = l3 = 0; + if (TG_(clo).simulate_cache) { + l2 = ULong_width(total[fullOffset(EG_DR)]); + l3 = ULong_width(total[fullOffset(EG_DW)]); + } + if (TG_(clo).simulate_branch) { + int l2b = ULong_width(total[fullOffset(EG_BC)]); + int l3b = ULong_width(total[fullOffset(EG_BI)]); + if (l2b > l2) + l2 = l2b; + if (l3b > l3) + l3 = l3b; + } + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1); + + /* Always print this */ + VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)]); + + if (TG_(clo).simulate_cache) + (*TG_(cachesim).printstat)(l1, l2, l3); + + if (TG_(clo).simulate_branch) + branchsim_printstat(l1, l2, l3); } +void TG_(fini)(Int exitcode) { finish(); } /*--------------------------------------------------------------------*/ /*--- Setup ---*/ /*--------------------------------------------------------------------*/ -static void tg_start_client_code_callback ( 
ThreadId tid, ULong blocks_done ) +static void tg_start_client_code_callback(ThreadId tid, ULong blocks_done) { static ULong last_blocks_done = 0; @@ -2004,40 +1971,37 @@ static void tg_start_client_code_callback ( ThreadId tid, ULong blocks_done ) VG_(printf)("%d R %llu\n", (Int)tid, blocks_done); /* throttle calls to TG_(run_thread) by number of BBs executed */ - if (blocks_done - last_blocks_done < 5000) return; + if (blocks_done - last_blocks_done < 5000) + return; last_blocks_done = blocks_done; - TG_(run_thread)( tid ); + TG_(run_thread)(tid); } /* * Called after fork() in the child process. * Reopens the trace file with the child's PID. */ -static void tg_atfork_child(ThreadId tid) -{ - TG_(trace_reopen_child)(); -} +static void tg_atfork_child(ThreadId tid) { TG_(trace_reopen_child)(); } static void tg_pre_thread_ll_create(ThreadId tid, ThreadId child) { - /* Skip Valgrind's internal scheduler thread (tid 0) creating the - initial client thread -- that's not a user-visible thread creation. */ - if (tid == 0) return; - TG_(trace_emit_thread_create)(tid, child); + /* Skip Valgrind's internal scheduler thread (tid 0) creating the + initial client thread -- that's not a user-visible thread creation. */ + if (tid == 0) + return; + TG_(trace_emit_thread_create)(tid, child); } -static -void TG_(post_clo_init)(void) +static void TG_(post_clo_init)(void) { - if (VG_(clo_vex_control).iropt_register_updates_default - != VexRegUpdSpAtMemAccess) { + if (VG_(clo_vex_control).iropt_register_updates_default != + VexRegUpdSpAtMemAccess) { TG_DEBUG(1, " Using user specified value for " - "--vex-iropt-register-updates\n"); + "--vex-iropt-register-updates\n"); } else { - TG_DEBUG(1, - " Using default --vex-iropt-register-updates=" - "sp-at-mem-access\n"); + TG_DEBUG(1, " Using default --vex-iropt-register-updates=" + "sp-at-mem-access\n"); } /* Always register syscall wrappers for fork/clone detection. @@ -2045,17 +2009,17 @@ void TG_(post_clo_init)(void) VG_(needs_syscall_wrapper)(TG_(pre_syscall), TG_(post_syscall)); if (TG_(clo).collect_systime != systime_no) { - syscalltime = TG_MALLOC("cl.main.pci.1", - VG_N_THREADS * sizeof syscalltime[0]); + syscalltime = + TG_MALLOC("cl.main.pci.1", VG_N_THREADS * sizeof syscalltime[0]); for (UInt i = 0; i < VG_N_THREADS; ++i) { - syscalltime[i].tv_sec = 0; + syscalltime[i].tv_sec = 0; syscalltime[i].tv_nsec = 0; } if (TG_(clo).collect_systime == systime_nsec) { - syscallcputime = TG_MALLOC("cl.main.pci.2", - VG_N_THREADS * sizeof syscallcputime[0]); + syscallcputime = + TG_MALLOC("cl.main.pci.2", VG_N_THREADS * sizeof syscallcputime[0]); for (UInt i = 0; i < VG_N_THREADS; ++i) { - syscallcputime[i].tv_sec = 0; + syscallcputime[i].tv_sec = 0; syscallcputime[i].tv_nsec = 0; } } @@ -2063,18 +2027,17 @@ void TG_(post_clo_init)(void) if (VG_(clo_px_file_backed) != VexRegUpdSpAtMemAccess) { TG_DEBUG(1, " Using user specified value for " - "--px-file-backed\n"); + "--px-file-backed\n"); } else { - TG_DEBUG(1, - " Using default --px-file-backed=" - "sp-at-mem-access\n"); + TG_DEBUG(1, " Using default --px-file-backed=" + "sp-at-mem-access\n"); } if (VG_(clo_vex_control).iropt_unroll_thresh != 0) { VG_(message)(Vg_UserMsg, "tracegrind only works with --vex-iropt-unroll-thresh=0\n" "=> resetting it back to 0\n"); - VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. 
} if (VG_(clo_vex_control).guest_chase) { VG_(message)(Vg_UserMsg, @@ -2083,15 +2046,16 @@ void TG_(post_clo_init)(void) VG_(clo_vex_control).guest_chase = False; // cannot be overridden. } - TG_DEBUG(1, " dump threads: %s\n", TG_(clo).separate_threads ? "Yes":"No"); + TG_DEBUG(1, " dump threads: %s\n", + TG_(clo).separate_threads ? "Yes" : "No"); TG_DEBUG(1, " call sep. : %d\n", TG_(clo).separate_callers); TG_DEBUG(1, " rec. sep. : %d\n", TG_(clo).separate_recursions); (*TG_(cachesim).post_clo_init)(); TG_(init_eventsets)(); - TG_(init_statistics)(& TG_(stat)); - TG_(init_cost_lz)( TG_(sets).full, &TG_(total_cost) ); + TG_(init_statistics)(&TG_(stat)); + TG_(init_cost_lz)(TG_(sets).full, &TG_(total_cost)); /* initialize hash tables */ TG_(init_obj_table)(); @@ -2110,52 +2074,46 @@ void TG_(post_clo_init)(void) VG_(atfork)(NULL, NULL, tg_atfork_child); if (VG_(clo_verbosity) > 0) { - VG_(message)(Vg_UserMsg, - "Streaming trace output to tracegrind.out.%d\n", + VG_(message)(Vg_UserMsg, "Streaming trace output to tracegrind.out.%d\n", VG_(getpid)()); } } -static -void TG_(pre_clo_init)(void) +static void TG_(pre_clo_init)(void) { - VG_(details_name) ("Tracegrind"); - VG_(details_version) (NULL); - VG_(details_description) ("a streaming trace cache profiler"); - VG_(details_copyright_author)("Copyright (C) 2026, and GNU GPL'd, " - "by CodSpeed Technology SAS. " - "Based on Callgrind by Josef Weidendorfer et al."); - VG_(details_bug_reports_to) (VG_BUGS_TO); - VG_(details_avg_translation_sizeB) ( 500 ); - - VG_(clo_vex_control).iropt_register_updates_default - = VG_(clo_px_file_backed) - = VexRegUpdSpAtMemAccess; // overridable by the user. - - VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. - VG_(clo_vex_control).guest_chase = False; // cannot be overridden. + VG_(details_name)("Tracegrind"); + VG_(details_version)(NULL); + VG_(details_description)("a streaming trace cache profiler"); + VG_(details_copyright_author)( + "Copyright (C) 2026, and GNU GPL'd, " + "by CodSpeed Technology SAS. " + "Based on Callgrind by Josef Weidendorfer et al."); + VG_(details_bug_reports_to)(VG_BUGS_TO); + VG_(details_avg_translation_sizeB)(500); - VG_(basic_tool_funcs) (TG_(post_clo_init), - TG_(instrument), - TG_(fini)); + VG_(clo_vex_control).iropt_register_updates_default = + VG_(clo_px_file_backed) = + VexRegUpdSpAtMemAccess; // overridable by the user. - VG_(needs_superblock_discards)(tg_discard_superblock_info); + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. 
+ VG_(basic_tool_funcs)(TG_(post_clo_init), TG_(instrument), TG_(fini)); - VG_(needs_command_line_options)(TG_(process_cmd_line_option), - TG_(print_usage), - TG_(print_debug_usage)); + VG_(needs_superblock_discards)(tg_discard_superblock_info); - VG_(needs_client_requests)(TG_(handle_client_request)); - VG_(needs_print_stats) (tg_print_stats); + VG_(needs_command_line_options)(TG_(process_cmd_line_option), + TG_(print_usage), TG_(print_debug_usage)); - VG_(track_start_client_code) ( & tg_start_client_code_callback ); - VG_(track_pre_deliver_signal) ( & TG_(pre_signal) ); - VG_(track_post_deliver_signal)( & TG_(post_signal) ); - VG_(track_pre_thread_ll_create)( & tg_pre_thread_ll_create ); + VG_(needs_client_requests)(TG_(handle_client_request)); + VG_(needs_print_stats)(tg_print_stats); - TG_(set_clo_defaults)(); + VG_(track_start_client_code)(&tg_start_client_code_callback); + VG_(track_pre_deliver_signal)(&TG_(pre_signal)); + VG_(track_post_deliver_signal)(&TG_(post_signal)); + VG_(track_pre_thread_ll_create)(&tg_pre_thread_ll_create); + TG_(set_clo_defaults)(); } VG_DETERMINE_INTERFACE_VERSION(TG_(pre_clo_init)) diff --git a/tracegrind/sim.c b/tracegrind/sim.c index d1393d00f..25d8cf983 100644 --- a/tracegrind/sim.c +++ b/tracegrind/sim.c @@ -30,7 +30,6 @@ #include "global.h" - /* Notes: - simulates a write-allocate cache - (block --> set) hash function uses simple bit selection @@ -46,38 +45,38 @@ /* additional structures for cache use info, separated * according usage frequency: - * - line_loaded : pointer to cost center of instruction + * - line_loaded : pointer to cost center of instruction * which loaded the line into cache. * Needed to increment counters when line is evicted. * - line_use : updated on every access */ typedef struct { - UInt count; - UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */ + UInt count; + UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */ } line_use; typedef struct { - Addr memline, iaddr; - line_use* dep_use; /* point to higher-level cacheblock for this memline */ - ULong* use_base; -} line_loaded; + Addr memline, iaddr; + line_use* dep_use; /* point to higher-level cacheblock for this memline */ + ULong* use_base; +} line_loaded; /* Cache state */ typedef struct { const HChar* name; - int size; /* bytes */ + int size; /* bytes */ int assoc; - int line_size; /* bytes */ + int line_size; /* bytes */ Bool sectored; /* prefetch nearside cacheline on read */ int sets; int sets_min_1; int line_size_bits; int tag_shift; UWord tag_mask; - HChar desc_line[128]; // large enough + HChar desc_line[128]; // large enough UWord* tags; - /* for cache use */ + /* for cache use */ int line_size_mask; int* line_start_mask; int* line_end_mask; @@ -87,20 +86,19 @@ typedef struct { /* * States of flat caches in our model. 
- * We use a 2-level hierarchy, + * We use a 2-level hierarchy, */ static cache_t2 I1, D1, LL; /* Lower bits of cache tags are used as flags for a cache line */ -#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1) +#define CACHELINE_FLAGMASK (MIN_LINE_SIZE - 1) #define CACHELINE_DIRTY 1 - /* Cache simulator Options */ static Bool clo_simulate_writeback = False; -static Bool clo_simulate_hwpref = False; -static Bool clo_simulate_sectors = False; -static Bool clo_collect_cacheuse = False; +static Bool clo_simulate_hwpref = False; +static Bool clo_simulate_sectors = False; +static Bool clo_collect_cacheuse = False; /* Following global vars are setup before by setup_bbcc(): * @@ -117,32 +115,28 @@ static InstrInfo* current_ii; /* The offsets are only correct because all per-instruction event sets get * the "Use" set added first ! */ -static Int off_I1_AcCost = 0; -static Int off_I1_SpLoss = 1; -static Int off_D1_AcCost = 0; -static Int off_D1_SpLoss = 1; -static Int off_LL_AcCost = 2; -static Int off_LL_SpLoss = 3; +static Int off_I1_AcCost = 0; +static Int off_I1_SpLoss = 1; +static Int off_D1_AcCost = 0; +static Int off_D1_SpLoss = 1; +static Int off_LL_AcCost = 2; +static Int off_LL_SpLoss = 3; /* Cache access types */ typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType; /* Result of a reference into a flat cache */ -typedef enum { Hit = 0, Miss, MissDirty } CacheResult; +typedef enum { Hit = 0, Miss, MissDirty } CacheResult; /* Result of a reference into a hierarchical cache model */ -typedef enum { - L1_Hit, - LL_Hit, - MemAccess, - WriteBackMemAccess } CacheModelResult; +typedef enum { L1_Hit, LL_Hit, MemAccess, WriteBackMemAccess } CacheModelResult; typedef CacheModelResult (*simcall_type)(Addr, UChar); static struct { - simcall_type I1_Read; - simcall_type D1_Read; - simcall_type D1_Write; + simcall_type I1_Read; + simcall_type D1_Read; + simcall_type D1_Write; } simulator; /*------------------------------------------------------------*/ @@ -151,21 +145,21 @@ static struct { static void cachesim_clearcache(cache_t2* c) { - Int i; - - for (i = 0; i < c->sets * c->assoc; i++) - c->tags[i] = 0; - if (c->use) { - for (i = 0; i < c->sets * c->assoc; i++) { - c->loaded[i].memline = 0; - c->loaded[i].use_base = 0; - c->loaded[i].dep_use = 0; - c->loaded[i].iaddr = 0; - c->use[i].mask = 0; - c->use[i].count = 0; - c->tags[i] = i % c->assoc; /* init lower bits as pointer */ - } - } + Int i; + + for (i = 0; i < c->sets * c->assoc; i++) + c->tags[i] = 0; + if (c->use) { + for (i = 0; i < c->sets * c->assoc; i++) { + c->loaded[i].memline = 0; + c->loaded[i].use_base = 0; + c->loaded[i].dep_use = 0; + c->loaded[i].iaddr = 0; + c->use[i].mask = 0; + c->use[i].count = 0; + c->tags[i] = i % c->assoc; /* init lower bits as pointer */ + } + } } static void cacheuse_initcache(cache_t2* c); @@ -181,33 +175,30 @@ static void cachesim_initcache(cache_t config, cache_t2* c) c->sets = (c->size / c->line_size) / c->assoc; c->sets_min_1 = c->sets - 1; c->line_size_bits = VG_(log2)(c->line_size); - c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); - c->tag_mask = ~((1u<tag_shift)-1); + c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); + c->tag_mask = ~((1u << c->tag_shift) - 1); /* Can bits in tag entries be used for flags? * Should be always true as MIN_LINE_SIZE >= 16 */ - TG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0); + TG_ASSERT((c->tag_mask & CACHELINE_FLAGMASK) == 0); if (c->assoc == 1) { - VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s", - c->size, c->line_size, - c->sectored ? 
", sectored":""); + VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s", c->size, + c->line_size, c->sectored ? ", sectored" : ""); } else { - VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s", - c->size, c->line_size, c->assoc, - c->sectored ? ", sectored":""); + VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s", c->size, + c->line_size, c->assoc, c->sectored ? ", sectored" : ""); } - c->tags = (UWord*) TG_MALLOC("cl.sim.cs_ic.1", - sizeof(UWord) * c->sets * c->assoc); + c->tags = + (UWord*)TG_MALLOC("cl.sim.cs_ic.1", sizeof(UWord) * c->sets * c->assoc); if (clo_collect_cacheuse) - cacheuse_initcache(c); + cacheuse_initcache(c); else - c->use = 0; + c->use = 0; cachesim_clearcache(c); } - #if 0 static void print_cache(cache_t2* c) { @@ -221,8 +212,7 @@ static void print_cache(cache_t2* c) VG_(printf)("\n"); } } -#endif - +#endif /*------------------------------------------------------------*/ /*--- Simple Cache Simulation ---*/ @@ -241,92 +231,91 @@ static void print_cache(cache_t2* c) * CacheModelResult cachesim_I1_ref(Addr a, UChar size) * CacheModelResult cachesim_D1_ref(Addr a, UChar size) */ -__attribute__((always_inline)) -static __inline__ -CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag) +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_setref(cache_t2* c, UInt set_no, UWord tag) { - int i, j; - UWord *set; - - set = &(c->tags[set_no * c->assoc]); - - /* This loop is unrolled for just the first case, which is the most */ - /* common. We can't unroll any further because it would screw up */ - /* if we have a direct-mapped (1-way) cache. */ - if (tag == set[0]) - return Hit; - - /* If the tag is one other than the MRU, move it into the MRU spot */ - /* and shuffle the rest down. */ - for (i = 1; i < c->assoc; i++) { - if (tag == set[i]) { - for (j = i; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tag; - return Hit; - } - } - - /* A miss; install this tag as MRU, shuffle rest down. */ - for (j = c->assoc - 1; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tag; - - return Miss; + int i, j; + UWord* set; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == set[0]) + return Hit; + + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == set[i]) { + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + + return Miss; } -__attribute__((always_inline)) -static __inline__ -CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size) +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_ref(cache_t2* c, Addr a, UChar size) { - UWord block1 = a >> c->line_size_bits; - UWord block2 = (a+size-1) >> c->line_size_bits; - UInt set1 = block1 & c->sets_min_1; - /* the tag does not need to include bits specifying the set, - * but it can, and this saves instructions */ - UWord tag1 = block1; - - /* Access entirely within line. */ - if (block1 == block2) - return cachesim_setref(c, set1, tag1); - - /* Access straddles two lines. 
*/ - else if (block1 + 1 == block2) { - UInt set2 = block2 & c->sets_min_1; - UWord tag2 = block2; - - /* the call updates cache structures as side effect */ - CacheResult res1 = cachesim_setref(c, set1, tag1); - CacheResult res2 = cachesim_setref(c, set2, tag2); - return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + UWord block1 = a >> c->line_size_bits; + UWord block2 = (a + size - 1) >> c->line_size_bits; + UInt set1 = block1 & c->sets_min_1; + /* the tag does not need to include bits specifying the set, + * but it can, and this saves instructions */ + UWord tag1 = block1; + + /* Access entirely within line. */ + if (block1 == block2) + return cachesim_setref(c, set1, tag1); + + /* Access straddles two lines. */ + else if (block1 + 1 == block2) { + UInt set2 = block2 & c->sets_min_1; + UWord tag2 = block2; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref(c, set1, tag1); + CacheResult res2 = cachesim_setref(c, set2, tag2); + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; } else { - VG_(printf)("addr: %lx size: %u blocks: %lu %lu", - a, size, block1, block2); - VG_(tool_panic)("item straddles more than two cache sets"); + VG_(printf)("addr: %lx size: %u blocks: %lu %lu", a, size, block1, + block2); + VG_(tool_panic)("item straddles more than two cache sets"); } return Hit; } -static -CacheModelResult cachesim_I1_ref(Addr a, UChar size) +static CacheModelResult cachesim_I1_ref(Addr a, UChar size) { - if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit; - if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit; - return MemAccess; + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; } -static -CacheModelResult cachesim_D1_ref(Addr a, UChar size) +static CacheModelResult cachesim_D1_ref(Addr a, UChar size) { - if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit; - if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit; - return MemAccess; + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; } - /*------------------------------------------------------------*/ /*--- Write Back Cache Simulation ---*/ /*------------------------------------------------------------*/ @@ -347,126 +336,131 @@ CacheModelResult cachesim_D1_ref(Addr a, UChar size) * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference * type (Read/Write), the line gets dirty on a write. */ -__attribute__((always_inline)) -static __inline__ -CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) { - int i, j; - UWord *set, tmp_tag; - - set = &(c->tags[set_no * c->assoc]); - - /* This loop is unrolled for just the first case, which is the most */ - /* common. We can't unroll any further because it would screw up */ - /* if we have a direct-mapped (1-way) cache. */ - if (tag == (set[0] & ~CACHELINE_DIRTY)) { - set[0] |= ref; - return Hit; - } - /* If the tag is one other than the MRU, move it into the MRU spot */ - /* and shuffle the rest down. */ - for (i = 1; i < c->assoc; i++) { - if (tag == (set[i] & ~CACHELINE_DIRTY)) { - tmp_tag = set[i] | ref; // update dirty flag - for (j = i; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tmp_tag; - return Hit; - } - } - - /* A miss; install this tag as MRU, shuffle rest down. 
*/ - tmp_tag = set[c->assoc - 1]; - for (j = c->assoc - 1; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tag | ref; - - return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss; + int i, j; + UWord *set, tmp_tag; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == (set[0] & ~CACHELINE_DIRTY)) { + set[0] |= ref; + return Hit; + } + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == (set[i] & ~CACHELINE_DIRTY)) { + tmp_tag = set[i] | ref; // update dirty flag + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[c->assoc - 1]; + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | ref; + + return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss; } -__attribute__((always_inline)) -static __inline__ -CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) { - UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1); - UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1); - UWord tag = a & c->tag_mask; + UInt set1 = (a >> c->line_size_bits) & (c->sets_min_1); + UInt set2 = ((a + size - 1) >> c->line_size_bits) & (c->sets_min_1); + UWord tag = a & c->tag_mask; - /* Access entirely within line. */ - if (set1 == set2) - return cachesim_setref_wb(c, ref, set1, tag); + /* Access entirely within line. */ + if (set1 == set2) + return cachesim_setref_wb(c, ref, set1, tag); - /* Access straddles two lines. */ - /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ - else if (((set1 + 1) & (c->sets_min_1)) == set2) { - UWord tag2 = (a+size-1) & c->tag_mask; + /* Access straddles two lines. */ + /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ + else if (((set1 + 1) & (c->sets_min_1)) == set2) { + UWord tag2 = (a + size - 1) & c->tag_mask; - /* the call updates cache structures as side effect */ - CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); - CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2); + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); + CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2); - if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty; - return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + if ((res1 == MissDirty) || (res2 == MissDirty)) + return MissDirty; + return ((res1 == Miss) || (res2 == Miss)) ? 
Miss : Hit; } else { - VG_(printf)("addr: %lx size: %u sets: %u %u", a, size, set1, set2); - VG_(tool_panic)("item straddles more than two cache sets"); + VG_(printf)("addr: %lx size: %u sets: %u %u", a, size, set1, set2); + VG_(tool_panic)("item straddles more than two cache sets"); } return Hit; } - -static -CacheModelResult cachesim_I1_Read(Addr a, UChar size) +static CacheModelResult cachesim_I1_Read(Addr a, UChar size) { - if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit; - switch( cachesim_ref_wb( &LL, Read, a, size) ) { - case Hit: return LL_Hit; - case Miss: return MemAccess; - default: break; - } - return WriteBackMemAccess; + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; } -static -CacheModelResult cachesim_D1_Read(Addr a, UChar size) +static CacheModelResult cachesim_D1_Read(Addr a, UChar size) { - if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit; - switch( cachesim_ref_wb( &LL, Read, a, size) ) { - case Hit: return LL_Hit; - case Miss: return MemAccess; - default: break; - } - return WriteBackMemAccess; + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; } -static -CacheModelResult cachesim_D1_Write(Addr a, UChar size) +static CacheModelResult cachesim_D1_Write(Addr a, UChar size) { - if ( cachesim_ref( &D1, a, size) == Hit ) { - /* Even for a L1 hit, the write-trough L1 passes - * the write to the LL to make the LL line dirty. - * But this causes no latency, so return the hit. - */ - cachesim_ref_wb( &LL, Write, a, size); - return L1_Hit; - } - switch( cachesim_ref_wb( &LL, Write, a, size) ) { - case Hit: return LL_Hit; - case Miss: return MemAccess; - default: break; - } - return WriteBackMemAccess; + if (cachesim_ref(&D1, a, size) == Hit) { + /* Even for a L1 hit, the write-trough L1 passes + * the write to the LL to make the LL line dirty. + * But this causes no latency, so return the hit. 
+ */ + cachesim_ref_wb(&LL, Write, a, size); + return L1_Hit; + } + switch (cachesim_ref_wb(&LL, Write, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; } - /*------------------------------------------------------------*/ /*--- Hardware Prefetch Simulation ---*/ /*------------------------------------------------------------*/ -static ULong prefetch_up = 0; +static ULong prefetch_up = 0; static ULong prefetch_down = 0; #define PF_STREAMS 8 @@ -475,12 +469,11 @@ static ULong prefetch_down = 0; static UInt pf_lastblock[PF_STREAMS]; static Int pf_seqblocks[PF_STREAMS]; -static -void prefetch_clear(void) +static void prefetch_clear(void) { - int i; - for(i=0;i> PF_PAGEBITS) % PF_STREAMS; - UInt block = ( a >> LL.line_size_bits); - - if (block != pf_lastblock[stream]) { - if (pf_seqblocks[stream] == 0) { - if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++; - else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--; - } - else if (pf_seqblocks[stream] >0) { - if (pf_lastblock[stream] +1 == block) { - pf_seqblocks[stream]++; - if (pf_seqblocks[stream] >= 2) { - prefetch_up++; - cachesim_ref(&LL, a + 5 * LL.line_size,1); - } - } - else pf_seqblocks[stream] = 0; - } - else if (pf_seqblocks[stream] <0) { - if (pf_lastblock[stream] -1 == block) { - pf_seqblocks[stream]--; - if (pf_seqblocks[stream] <= -2) { - prefetch_down++; - cachesim_ref(&LL, a - 5 * LL.line_size,1); - } + UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS; + UInt block = (a >> LL.line_size_bits); + + if (block != pf_lastblock[stream]) { + if (pf_seqblocks[stream] == 0) { + if (pf_lastblock[stream] + 1 == block) + pf_seqblocks[stream]++; + else if (pf_lastblock[stream] - 1 == block) + pf_seqblocks[stream]--; + } else if (pf_seqblocks[stream] > 0) { + if (pf_lastblock[stream] + 1 == block) { + pf_seqblocks[stream]++; + if (pf_seqblocks[stream] >= 2) { + prefetch_up++; + cachesim_ref(&LL, a + 5 * LL.line_size, 1); + } + } else + pf_seqblocks[stream] = 0; + } else if (pf_seqblocks[stream] < 0) { + if (pf_lastblock[stream] - 1 == block) { + pf_seqblocks[stream]--; + if (pf_seqblocks[stream] <= -2) { + prefetch_down++; + cachesim_ref(&LL, a - 5 * LL.line_size, 1); + } + } else + pf_seqblocks[stream] = 0; } - else pf_seqblocks[stream] = 0; - } - pf_lastblock[stream] = block; - } -} + pf_lastblock[stream] = block; + } +} /* simple model with hardware prefetch */ -static -CacheModelResult prefetch_I1_ref(Addr a, UChar size) +static CacheModelResult prefetch_I1_ref(Addr a, UChar size) { - if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit; - prefetch_LL_doref(a); - if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit; - return MemAccess; + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; } -static -CacheModelResult prefetch_D1_ref(Addr a, UChar size) +static CacheModelResult prefetch_D1_ref(Addr a, UChar size) { - if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit; - prefetch_LL_doref(a); - if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit; - return MemAccess; + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; } - /* complex model with hardware prefetch */ -static -CacheModelResult prefetch_I1_Read(Addr a, UChar size) +static CacheModelResult prefetch_I1_Read(Addr a, UChar size) { - if ( cachesim_ref( &I1, a, size) 
== Hit ) return L1_Hit; - prefetch_LL_doref(a); - switch( cachesim_ref_wb( &LL, Read, a, size) ) { - case Hit: return LL_Hit; - case Miss: return MemAccess; - default: break; - } - return WriteBackMemAccess; + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; } -static -CacheModelResult prefetch_D1_Read(Addr a, UChar size) +static CacheModelResult prefetch_D1_Read(Addr a, UChar size) { - if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit; - prefetch_LL_doref(a); - switch( cachesim_ref_wb( &LL, Read, a, size) ) { - case Hit: return LL_Hit; - case Miss: return MemAccess; - default: break; - } - return WriteBackMemAccess; + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; } -static -CacheModelResult prefetch_D1_Write(Addr a, UChar size) +static CacheModelResult prefetch_D1_Write(Addr a, UChar size) { - prefetch_LL_doref(a); - if ( cachesim_ref( &D1, a, size) == Hit ) { - /* Even for a L1 hit, the write-trough L1 passes - * the write to the LL to make the LL line dirty. - * But this causes no latency, so return the hit. - */ - cachesim_ref_wb( &LL, Write, a, size); - return L1_Hit; - } - switch( cachesim_ref_wb( &LL, Write, a, size) ) { - case Hit: return LL_Hit; - case Miss: return MemAccess; - default: break; - } - return WriteBackMemAccess; + prefetch_LL_doref(a); + if (cachesim_ref(&D1, a, size) == Hit) { + /* Even for a L1 hit, the write-trough L1 passes + * the write to the LL to make the LL line dirty. + * But this causes no latency, so return the hit. + */ + cachesim_ref_wb(&LL, Write, a, size); + return L1_Hit; + } + switch (cachesim_ref_wb(&LL, Write, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; } - /*------------------------------------------------------------*/ /*--- Cache Simulation with use metric collection ---*/ /*------------------------------------------------------------*/ /* can not be combined with write-back or prefetch */ -static -void cacheuse_initcache(cache_t2* c) +static void cacheuse_initcache(cache_t2* c) { - int i; - unsigned int start_mask, start_val; - unsigned int end_mask, end_val; - - c->use = TG_MALLOC("cl.sim.cu_ic.1", - sizeof(line_use) * c->sets * c->assoc); - c->loaded = TG_MALLOC("cl.sim.cu_ic.2", - sizeof(line_loaded) * c->sets * c->assoc); - c->line_start_mask = TG_MALLOC("cl.sim.cu_ic.3", - sizeof(int) * c->line_size); - c->line_end_mask = TG_MALLOC("cl.sim.cu_ic.4", - sizeof(int) * c->line_size); - - c->line_size_mask = c->line_size-1; - - /* Meaning of line_start_mask/line_end_mask - * Example: for a given cache line, you get an access starting at - * byte offset 5, length 4, byte 5 - 8 was touched. 
For a cache - * line size of 32, you have 1 bit per byte in the mask: - * - * bit31 bit8 bit5 bit 0 - * | | | | - * 11..111111100000 line_start_mask[5] - * 00..000111111111 line_end_mask[(5+4)-1] - * - * use_mask |= line_start_mask[5] && line_end_mask[8] - * - */ - start_val = end_val = ~0; - if (c->line_size < 32) { - int bits_per_byte = 32/c->line_size; - start_mask = (1<line_size;i++) { - c->line_start_mask[i] = start_val; - start_val = start_val & ~start_mask; - start_mask = start_mask << bits_per_byte; - - c->line_end_mask[c->line_size-i-1] = end_val; - end_val = end_val & ~end_mask; - end_mask = end_mask >> bits_per_byte; - } - } - else { - int bytes_per_bit = c->line_size/32; - start_mask = 1; - end_mask = 1u << 31; - for(i=0;iline_size;i++) { - c->line_start_mask[i] = start_val; - c->line_end_mask[c->line_size-i-1] = end_val; - if ( ((i+1)%bytes_per_bit) == 0) { - start_val &= ~start_mask; - end_val &= ~end_mask; - start_mask <<= 1; - end_mask >>= 1; - } - } - } - - TG_DEBUG(6, "Config %s:\n", c->desc_line); - for(i=0;iline_size;i++) { - TG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n", - i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]); - } - - /* We use lower tag bits as offset pointers to cache use info. - * I.e. some cache parameters don't work. - */ - if ( (1<tag_shift) < c->assoc) { - VG_(message)(Vg_DebugMsg, - "error: Use associativity < %d for cache use statistics!\n", - (1<tag_shift) ); - VG_(tool_panic)("Unsupported cache configuration"); - } -} - + int i; + unsigned int start_mask, start_val; + unsigned int end_mask, end_val; + + c->use = TG_MALLOC("cl.sim.cu_ic.1", sizeof(line_use) * c->sets * c->assoc); + c->loaded = + TG_MALLOC("cl.sim.cu_ic.2", sizeof(line_loaded) * c->sets * c->assoc); + c->line_start_mask = TG_MALLOC("cl.sim.cu_ic.3", sizeof(int) * c->line_size); + c->line_end_mask = TG_MALLOC("cl.sim.cu_ic.4", sizeof(int) * c->line_size); + + c->line_size_mask = c->line_size - 1; + + /* Meaning of line_start_mask/line_end_mask + * Example: for a given cache line, you get an access starting at + * byte offset 5, length 4, byte 5 - 8 was touched. 
For a cache + * line size of 32, you have 1 bit per byte in the mask: + * + * bit31 bit8 bit5 bit 0 + * | | | | + * 11..111111100000 line_start_mask[5] + * 00..000111111111 line_end_mask[(5+4)-1] + * + * use_mask |= line_start_mask[5] && line_end_mask[8] + * + */ + start_val = end_val = ~0; + if (c->line_size < 32) { + int bits_per_byte = 32 / c->line_size; + start_mask = (1 << bits_per_byte) - 1; + end_mask = start_mask << (32 - bits_per_byte); + for (i = 0; i < c->line_size; i++) { + c->line_start_mask[i] = start_val; + start_val = start_val & ~start_mask; + start_mask = start_mask << bits_per_byte; + + c->line_end_mask[c->line_size - i - 1] = end_val; + end_val = end_val & ~end_mask; + end_mask = end_mask >> bits_per_byte; + } + } else { + int bytes_per_bit = c->line_size / 32; + start_mask = 1; + end_mask = 1u << 31; + for (i = 0; i < c->line_size; i++) { + c->line_start_mask[i] = start_val; + c->line_end_mask[c->line_size - i - 1] = end_val; + if (((i + 1) % bytes_per_bit) == 0) { + start_val &= ~start_mask; + end_val &= ~end_mask; + start_mask <<= 1; + end_mask >>= 1; + } + } + } -/* for I1/D1 caches */ -#define CACHEUSE(L) \ - \ -static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ -{ \ - UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \ - UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \ - UWord tag = a & L.tag_mask; \ - UWord tag2; \ - int i, j, idx; \ - UWord *set, tmp_tag; \ - UInt use_mask; \ - \ - TG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", \ - L.name, a, size, set1, set2); \ - \ - /* First case: word entirely within line. */ \ - if (set1 == set2) { \ - \ - set = &(L.tags[set1 * L.assoc]); \ - use_mask = L.line_start_mask[a & L.line_size_mask] & \ - L.line_end_mask[(a+size-1) & L.line_size_mask]; \ - \ - /* This loop is unrolled for just the first case, which is the most */\ - /* common. We can't unroll any further because it would screw up */\ - /* if we have a direct-mapped (1-way) cache. */\ - if (tag == (set[0] & L.tag_mask)) { \ - idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ - L.use[idx].count ++; \ - L.use[idx].mask |= use_mask; \ - TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ - idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ - use_mask, L.use[idx].mask, L.use[idx].count); \ - return L1_Hit; \ - } \ - /* If the tag is one other than the MRU, move it into the MRU spot */\ - /* and shuffle the rest down. */\ - for (i = 1; i < L.assoc; i++) { \ - if (tag == (set[i] & L.tag_mask)) { \ - tmp_tag = set[i]; \ - for (j = i; j > 0; j--) { \ - set[j] = set[j - 1]; \ - } \ - set[0] = tmp_tag; \ - idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ - L.use[idx].count ++; \ - L.use[idx].mask |= use_mask; \ - TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ - i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ - use_mask, L.use[idx].mask, L.use[idx].count); \ - return L1_Hit; \ - } \ - } \ - \ - /* A miss; install this tag as MRU, shuffle rest down. */ \ - tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ - for (j = L.assoc - 1; j > 0; j--) { \ - set[j] = set[j - 1]; \ - } \ - set[0] = tag | tmp_tag; \ - idx = (set1 * L.assoc) + tmp_tag; \ - return update_##L##_use(&L, idx, \ - use_mask, a &~ L.line_size_mask); \ - \ - /* Second case: word straddles two lines. 
*/ \ - /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ - } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \ - Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \ - set = &(L.tags[set1 * L.assoc]); \ - use_mask = L.line_start_mask[a & L.line_size_mask]; \ - if (tag == (set[0] & L.tag_mask)) { \ - idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ - L.use[idx].count ++; \ - L.use[idx].mask |= use_mask; \ - TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ - idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ - use_mask, L.use[idx].mask, L.use[idx].count); \ - goto block2; \ - } \ - for (i = 1; i < L.assoc; i++) { \ - if (tag == (set[i] & L.tag_mask)) { \ - tmp_tag = set[i]; \ - for (j = i; j > 0; j--) { \ - set[j] = set[j - 1]; \ - } \ - set[0] = tmp_tag; \ - idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ - L.use[idx].count ++; \ - L.use[idx].mask |= use_mask; \ - TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ - i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ - use_mask, L.use[idx].mask, L.use[idx].count); \ - goto block2; \ - } \ - } \ - tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ - for (j = L.assoc - 1; j > 0; j--) { \ - set[j] = set[j - 1]; \ - } \ - set[0] = tag | tmp_tag; \ - idx = (set1 * L.assoc) + tmp_tag; \ - miss1 = update_##L##_use(&L, idx, \ - use_mask, a &~ L.line_size_mask); \ -block2: \ - set = &(L.tags[set2 * L.assoc]); \ - use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \ - tag2 = (a+size-1) & L.tag_mask; \ - if (tag2 == (set[0] & L.tag_mask)) { \ - idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ - L.use[idx].count ++; \ - L.use[idx].mask |= use_mask; \ - TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ - idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ - use_mask, L.use[idx].mask, L.use[idx].count); \ - return miss1; \ - } \ - for (i = 1; i < L.assoc; i++) { \ - if (tag2 == (set[i] & L.tag_mask)) { \ - tmp_tag = set[i]; \ - for (j = i; j > 0; j--) { \ - set[j] = set[j - 1]; \ - } \ - set[0] = tmp_tag; \ - idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ - L.use[idx].count ++; \ - L.use[idx].mask |= use_mask; \ - TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\ - i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ - use_mask, L.use[idx].mask, L.use[idx].count); \ - return miss1; \ - } \ - } \ - tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ - for (j = L.assoc - 1; j > 0; j--) { \ - set[j] = set[j - 1]; \ - } \ - set[0] = tag2 | tmp_tag; \ - idx = (set2 * L.assoc) + tmp_tag; \ - miss2 = update_##L##_use(&L, idx, \ - use_mask, (a+size-1) &~ L.line_size_mask); \ - return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \ - \ - } else { \ - VG_(printf)("addr: %#lx size: %u sets: %u %u", a, size, set1, set2); \ - VG_(tool_panic)("item straddles more than two cache sets"); \ - } \ - return 0; \ + TG_DEBUG(6, "Config %s:\n", c->desc_line); + for (i = 0; i < c->line_size; i++) { + TG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n", i, + (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]); + } + + /* We use lower tag bits as offset pointers to cache use info. + * I.e. some cache parameters don't work. 
+ */ + if ((1 << c->tag_shift) < c->assoc) { + VG_(message)(Vg_DebugMsg, + "error: Use associativity < %d for cache use statistics!\n", + (1 << c->tag_shift)); + VG_(tool_panic)("Unsupported cache configuration"); + } } +/* for I1/D1 caches */ +#define CACHEUSE(L) \ + \ + static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ + { \ + UInt set1 = (a >> L.line_size_bits) & (L.sets_min_1); \ + UInt set2 = ((a + size - 1) >> L.line_size_bits) & (L.sets_min_1); \ + UWord tag = a & L.tag_mask; \ + UWord tag2; \ + int i, j, idx; \ + UWord *set, tmp_tag; \ + UInt use_mask; \ + \ + TG_DEBUG(6, "%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", L.name, a, \ + size, set1, set2); \ + \ + /* First case: word entirely within line. */ \ + if (set1 == set2) { \ + \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask] & \ + L.line_end_mask[(a + size - 1) & L.line_size_mask]; \ + \ + /* This loop is unrolled for just the first case, which is the most \ + */ \ + /* common. We can't unroll any further because it would screw up */ \ + /* if we have a direct-mapped (1-way) cache. */ \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + /* If the tag is one other than the MRU, move it into the MRU spot */ \ + /* and shuffle the rest down. */ \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + } \ + \ + /* A miss; install this tag as MRU, shuffle rest down. */ \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + return update_##L##_use(&L, idx, use_mask, a & ~L.line_size_mask); \ + \ + /* Second case: word straddles two lines. 
*/ \ + /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ + } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \ + Int miss1 = 0, miss2 = 0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask]; \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + miss1 = update_##L##_use(&L, idx, use_mask, a & ~L.line_size_mask); \ + block2: \ + set = &(L.tags[set2 * L.assoc]); \ + use_mask = L.line_end_mask[(a + size - 1) & L.line_size_mask]; \ + tag2 = (a + size - 1) & L.tag_mask; \ + if (tag2 == (set[0] & L.tag_mask)) { \ + idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag2 == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag2 | tmp_tag; \ + idx = (set2 * L.assoc) + tmp_tag; \ + miss2 = update_##L##_use(&L, idx, use_mask, \ + (a + size - 1) & ~L.line_size_mask); \ + return (miss1 == MemAccess || miss2 == MemAccess) ? 
MemAccess \ + : LL_Hit; \ + \ + } else { \ + VG_(printf)("addr: %#lx size: %u sets: %u %u", a, size, set1, \ + set2); \ + VG_(tool_panic)("item straddles more than two cache sets"); \ + } \ + return 0; \ + } /* logarithmic bitcounting algorithm, see * http://graphics.stanford.edu/~seander/bithacks.html */ static __inline__ unsigned int countBits(unsigned int bits) { - unsigned int c; // store the total here - const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers - const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF}; - - c = bits; - c = ((c >> S[0]) & B[0]) + (c & B[0]); - c = ((c >> S[1]) & B[1]) + (c & B[1]); - c = ((c >> S[2]) & B[2]) + (c & B[2]); - c = ((c >> S[3]) & B[3]) + (c & B[3]); - c = ((c >> S[4]) & B[4]) + (c & B[4]); - return c; + unsigned int c; // store the total here + const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers + const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF}; + + c = bits; + c = ((c >> S[0]) & B[0]) + (c & B[0]); + c = ((c >> S[1]) & B[1]) + (c & B[1]); + c = ((c >> S[2]) & B[2]) + (c & B[2]); + c = ((c >> S[3]) & B[3]) + (c & B[3]); + c = ((c >> S[4]) & B[4]) + (c & B[4]); + return c; } static void update_LL_use(int idx, Addr memline) { - line_loaded* loaded = &(LL.loaded[idx]); - line_use* use = &(LL.use[idx]); - int i = ((32 - countBits(use->mask)) * LL.line_size)>>5; - - TG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n", - idx, TG_(bb_base) + current_ii->instr_offset, memline); - if (use->count>0) { - TG_DEBUG(2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", - use->count, i, use->mask, loaded->memline, loaded->iaddr); - TG_DEBUG(2, " collect: %d, use_base %p\n", - TG_(current_state).collect, loaded->use_base); - - if (TG_(current_state).collect && loaded->use_base) { - (loaded->use_base)[off_LL_AcCost] += 1000 / use->count; - (loaded->use_base)[off_LL_SpLoss] += i; - } + line_loaded* loaded = &(LL.loaded[idx]); + line_use* use = &(LL.use[idx]); + int i = ((32 - countBits(use->mask)) * LL.line_size) >> 5; + + TG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n", idx, + TG_(bb_base) + current_ii->instr_offset, memline); + if (use->count > 0) { + TG_DEBUG(2, + " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", + use->count, i, use->mask, loaded->memline, loaded->iaddr); + TG_DEBUG(2, " collect: %d, use_base %p\n", TG_(current_state).collect, + loaded->use_base); + + if (TG_(current_state).collect && loaded->use_base) { + (loaded->use_base)[off_LL_AcCost] += 1000 / use->count; + (loaded->use_base)[off_LL_SpLoss] += i; + } } use->count = 0; use->mask = 0; - loaded->memline = memline; - loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; - loaded->use_base = (TG_(current_state).nonskipped) ? - TG_(current_state).nonskipped->skipped : - TG_(cost_base) + current_ii->cost_offset; + loaded->memline = memline; + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; + loaded->use_base = (TG_(current_state).nonskipped) + ? 
TG_(current_state).nonskipped->skipped + : TG_(cost_base) + current_ii->cost_offset; } -static -CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded) +static CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded) { - UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1); - UWord* set = &(LL.tags[setNo * LL.assoc]); - UWord tag = memline & LL.tag_mask; + UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1); + UWord* set = &(LL.tags[setNo * LL.assoc]); + UWord tag = memline & LL.tag_mask; - int i, j, idx; + int i, j, idx; UWord tmp_tag; - - TG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo); + + TG_DEBUG(6, "LL.Acc(Memline %#lx): Set %u\n", memline, setNo); if (tag == (set[0] & LL.tag_mask)) { - idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask); - l1_loaded->dep_use = &(LL.use[idx]); + idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); - TG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n", - idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, - LL.use[idx].mask, LL.use[idx].count); - return LL_Hit; + TG_DEBUG(6, " Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; } for (i = 1; i < LL.assoc; i++) { - if (tag == (set[i] & LL.tag_mask)) { - tmp_tag = set[i]; - for (j = i; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tmp_tag; - idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask); - l1_loaded->dep_use = &(LL.use[idx]); - - TG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n", - i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, - LL.use[idx].mask, LL.use[idx].count); - return LL_Hit; - } + if (tag == (set[i] & LL.tag_mask)) { + tmp_tag = set[i]; + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6, + " Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } } /* A miss; install this tag as MRU, shuffle rest down. 
*/ tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask; for (j = LL.assoc - 1; j > 0; j--) { - set[j] = set[j - 1]; + set[j] = set[j - 1]; } - set[0] = tag | tmp_tag; - idx = (setNo * LL.assoc) + tmp_tag; + set[0] = tag | tmp_tag; + idx = (setNo * LL.assoc) + tmp_tag; l1_loaded->dep_use = &(LL.use[idx]); update_LL_use(idx, memline); @@ -929,47 +936,48 @@ CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded) return MemAccess; } - - - -#define UPDATE_USE(L) \ - \ -static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \ - UInt mask, Addr memline) \ -{ \ - line_loaded* loaded = &(cache->loaded[idx]); \ - line_use* use = &(cache->use[idx]); \ - int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \ - \ - TG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \ - cache->name, idx, TG_(bb_base) + current_ii->instr_offset, memline, mask); \ - if (use->count>0) { \ - TG_DEBUG(2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",\ - use->count, c, use->mask, loaded->memline, loaded->iaddr); \ - TG_DEBUG(2, " collect: %d, use_base %p\n", \ - TG_(current_state).collect, loaded->use_base); \ - \ - if (TG_(current_state).collect && loaded->use_base) { \ - (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \ - (loaded->use_base)[off_##L##_SpLoss] += c; \ - \ - /* FIXME (?): L1/LL line sizes must be equal ! */ \ - loaded->dep_use->mask |= use->mask; \ - loaded->dep_use->count += use->count; \ - } \ - } \ - \ - use->count = 1; \ - use->mask = mask; \ - loaded->memline = memline; \ - loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; \ - loaded->use_base = (TG_(current_state).nonskipped) ? \ - TG_(current_state).nonskipped->skipped : \ - TG_(cost_base) + current_ii->cost_offset; \ - \ - if (memline == 0) return LL_Hit; \ - return cacheuse_LL_access(memline, loaded); \ -} +#define UPDATE_USE(L) \ + \ + static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \ + UInt mask, Addr memline) \ + { \ + line_loaded* loaded = &(cache->loaded[idx]); \ + line_use* use = &(cache->use[idx]); \ + int c = ((32 - countBits(use->mask)) * cache->line_size) >> 5; \ + \ + TG_DEBUG(2, \ + " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \ + cache->name, idx, TG_(bb_base) + current_ii->instr_offset, \ + memline, mask); \ + if (use->count > 0) { \ + TG_DEBUG( \ + 2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", \ + use->count, c, use->mask, loaded->memline, loaded->iaddr); \ + TG_DEBUG(2, " collect: %d, use_base %p\n", \ + TG_(current_state).collect, loaded->use_base); \ + \ + if (TG_(current_state).collect && loaded->use_base) { \ + (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \ + (loaded->use_base)[off_##L##_SpLoss] += c; \ + \ + /* FIXME (?): L1/LL line sizes must be equal ! */ \ + loaded->dep_use->mask |= use->mask; \ + loaded->dep_use->count += use->count; \ + } \ + } \ + \ + use->count = 1; \ + use->mask = mask; \ + loaded->memline = memline; \ + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; \ + loaded->use_base = (TG_(current_state).nonskipped) \ + ? 
TG_(current_state).nonskipped->skipped \ + : TG_(cost_base) + current_ii->cost_offset; \ + \ + if (memline == 0) \ + return LL_Hit; \ + return cacheuse_LL_access(memline, loaded); \ + } UPDATE_USE(I1); UPDATE_USE(D1); @@ -977,179 +985,187 @@ UPDATE_USE(D1); CACHEUSE(I1); CACHEUSE(D1); - -static -void cacheuse_finish(void) +static void cacheuse_finish(void) { - int i; - InstrInfo ii = { 0,0,0,0 }; + int i; + InstrInfo ii = {0, 0, 0, 0}; - if (!TG_(current_state).collect) return; + if (!TG_(current_state).collect) + return; - TG_(bb_base) = 0; - current_ii = ⅈ /* needs to be set for update_XX_use */ - TG_(cost_base) = 0; + TG_(bb_base) = 0; + current_ii = ⅈ /* needs to be set for update_XX_use */ + TG_(cost_base) = 0; - /* update usage counters */ - if (I1.use) - for (i = 0; i < I1.sets * I1.assoc; i++) - if (I1.loaded[i].use_base) - update_I1_use( &I1, i, 0,0); + /* update usage counters */ + if (I1.use) + for (i = 0; i < I1.sets * I1.assoc; i++) + if (I1.loaded[i].use_base) + update_I1_use(&I1, i, 0, 0); - if (D1.use) - for (i = 0; i < D1.sets * D1.assoc; i++) - if (D1.loaded[i].use_base) - update_D1_use( &D1, i, 0,0); + if (D1.use) + for (i = 0; i < D1.sets * D1.assoc; i++) + if (D1.loaded[i].use_base) + update_D1_use(&D1, i, 0, 0); - if (LL.use) - for (i = 0; i < LL.sets * LL.assoc; i++) - if (LL.loaded[i].use_base) - update_LL_use(i, 0); + if (LL.use) + for (i = 0; i < LL.sets * LL.assoc; i++) + if (LL.loaded[i].use_base) + update_LL_use(i, 0); - current_ii = 0; + current_ii = 0; } - - /*------------------------------------------------------------*/ /*--- Helper functions called by instrumented code ---*/ /*------------------------------------------------------------*/ - -static __inline__ -void inc_costs(CacheModelResult r, ULong* c1, ULong* c2) +static __inline__ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2) { - switch(r) { - case WriteBackMemAccess: - if (clo_simulate_writeback) { - c1[3]++; - c2[3]++; - } - // fall through - - case MemAccess: - c1[2]++; - c2[2]++; - // fall through - - case LL_Hit: - c1[1]++; - c2[1]++; - // fall through - - default: - c1[0]++; - c2[0]++; - } + switch (r) { + case WriteBackMemAccess: + if (clo_simulate_writeback) { + c1[3]++; + c2[3]++; + } + // fall through + + case MemAccess: + c1[2]++; + c2[2]++; + // fall through + + case LL_Hit: + c1[1]++; + c2[1]++; + // fall through + + default: + c1[0]++; + c2[0]++; + } } -static -const HChar* cacheRes(CacheModelResult r) +static const HChar* cacheRes(CacheModelResult r) { - switch(r) { - case L1_Hit: return "L1 Hit "; - case LL_Hit: return "LL Hit "; - case MemAccess: return "LL Miss"; - case WriteBackMemAccess: return "LL Miss (dirty)"; - default: - tl_assert(0); - } - return "??"; + switch (r) { + case L1_Hit: + return "L1 Hit "; + case LL_Hit: + return "LL Hit "; + case MemAccess: + return "LL Miss"; + case WriteBackMemAccess: + return "LL Miss (dirty)"; + default: + tl_assert(0); + } + return "??"; } VG_REGPARM(1) static void log_1I0D(InstrInfo* ii) { - CacheModelResult IrRes; + CacheModelResult IrRes; - current_ii = ii; - IrRes = (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); - TG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n", - TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes)); + TG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes)); - if (TG_(current_state).collect) { - ULong* cost_Ir; + if 
(TG_(current_state).collect) { + ULong* cost_Ir; - if (TG_(current_state).nonskipped) - cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); - else - cost_Ir = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + if (TG_(current_state).nonskipped) + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + else + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; - inc_costs(IrRes, cost_Ir, - TG_(current_state).cost + fullOffset(EG_IR) ); - } + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + } } VG_REGPARM(2) static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2) { - CacheModelResult Ir1Res, Ir2Res; - ULong *global_cost_Ir; - - current_ii = ii1; - Ir1Res = (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); - current_ii = ii2; - Ir2Res = (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); - - TG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n", - TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), - TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) ); - - if (!TG_(current_state).collect) return; - - global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); - if (TG_(current_state).nonskipped) { - ULong* skipped_cost_Ir = - TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); - - inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); - inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); - return; - } - - inc_costs(Ir1Res, global_cost_Ir, - TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); - inc_costs(Ir2Res, global_cost_Ir, - TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); + CacheModelResult Ir1Res, Ir2Res; + ULong* global_cost_Ir; + + current_ii = ii1; + Ir1Res = + (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = + (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + + TG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, + cacheRes(Ir2Res)); + + if (!TG_(current_state).collect) + return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); } VG_REGPARM(3) static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3) { - CacheModelResult Ir1Res, Ir2Res, Ir3Res; - ULong *global_cost_Ir; - - current_ii = ii1; - Ir1Res = (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); - current_ii = ii2; - Ir2Res = (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); - current_ii = ii3; - Ir3Res = (*simulator.I1_Read)(TG_(bb_base) + ii3->instr_offset, ii3->instr_size); - - TG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n", - TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), - TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res), - TG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) 
); - - if (!TG_(current_state).collect) return; - - global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); - if (TG_(current_state).nonskipped) { - ULong* skipped_cost_Ir = - TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); - inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); - inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); - inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir); - return; - } - - inc_costs(Ir1Res, global_cost_Ir, - TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); - inc_costs(Ir2Res, global_cost_Ir, - TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); - inc_costs(Ir3Res, global_cost_Ir, - TG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]); + CacheModelResult Ir1Res, Ir2Res, Ir3Res; + ULong* global_cost_Ir; + + current_ii = ii1; + Ir1Res = + (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = + (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + current_ii = ii3; + Ir3Res = + (*simulator.I1_Read)(TG_(bb_base) + ii3->instr_offset, ii3->instr_size); + + TG_DEBUG( + 6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res), + TG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res)); + + if (!TG_(current_state).collect) + return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); + inc_costs(Ir3Res, global_cost_Ir, + TG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]); } /* Instruction doing a read access */ @@ -1157,35 +1173,34 @@ static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3) VG_REGPARM(3) static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) { - CacheModelResult IrRes, DrRes; - - current_ii = ii; - IrRes = (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); - DrRes = (*simulator.D1_Read)(data_addr, data_size); - - TG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%ld => %s\n", - TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), - data_addr, data_size, cacheRes(DrRes)); - - if (TG_(current_state).collect) { - ULong *cost_Ir, *cost_Dr; - - if (TG_(current_state).nonskipped) { - cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); - cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); - } - else { - cost_Ir = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; - cost_Dr = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; - } - - inc_costs(IrRes, cost_Ir, - TG_(current_state).cost + fullOffset(EG_IR) ); - inc_costs(DrRes, cost_Dr, - TG_(current_state).cost + fullOffset(EG_DR) ); - } -} + CacheModelResult IrRes, DrRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dr: Ir 
%#lx/%u => %s, Dr %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dr; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + } else { + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dr = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + } + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + inc_costs(DrRes, cost_Dr, TG_(current_state).cost + fullOffset(EG_DR)); + } +} /* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw have exactly the same prototype. If you change them, you must @@ -1193,88 +1208,85 @@ static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) VG_REGPARM(3) static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) { - CacheModelResult DrRes; + CacheModelResult DrRes; - current_ii = ii; - DrRes = (*simulator.D1_Read)(data_addr, data_size); + current_ii = ii; + DrRes = (*simulator.D1_Read)(data_addr, data_size); - TG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n", - data_addr, data_size, cacheRes(DrRes)); + TG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n", data_addr, data_size, + cacheRes(DrRes)); - if (TG_(current_state).collect) { - ULong *cost_Dr; - - if (TG_(current_state).nonskipped) - cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); - else - cost_Dr = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + if (TG_(current_state).collect) { + ULong* cost_Dr; - inc_costs(DrRes, cost_Dr, - TG_(current_state).cost + fullOffset(EG_DR) ); - } -} + if (TG_(current_state).nonskipped) + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + else + cost_Dr = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + inc_costs(DrRes, cost_Dr, TG_(current_state).cost + fullOffset(EG_DR)); + } +} /* Instruction doing a write access */ VG_REGPARM(3) static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) { - CacheModelResult IrRes, DwRes; - - current_ii = ii; - IrRes = (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); - DwRes = (*simulator.D1_Write)(data_addr, data_size); - - TG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n", - TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), - data_addr, data_size, cacheRes(DwRes)); - - if (TG_(current_state).collect) { - ULong *cost_Ir, *cost_Dw; - - if (TG_(current_state).nonskipped) { - cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); - cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); - } - else { - cost_Ir = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; - cost_Dw = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; - } - - inc_costs(IrRes, cost_Ir, - TG_(current_state).cost + fullOffset(EG_IR) ); - inc_costs(DwRes, cost_Dw, - TG_(current_state).cost + fullOffset(EG_DW) ); - } + CacheModelResult IrRes, DwRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DwRes)); + + if (TG_(current_state).collect) { + 
ULong *cost_Ir, *cost_Dw; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + } else { + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dw = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + } + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + inc_costs(DwRes, cost_Dw, TG_(current_state).cost + fullOffset(EG_DW)); + } } /* See comment on log_0I1Dr. */ VG_REGPARM(3) static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) { - CacheModelResult DwRes; - - current_ii = ii; - DwRes = (*simulator.D1_Write)(data_addr, data_size); - - TG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n", - data_addr, data_size, cacheRes(DwRes)); - - if (TG_(current_state).collect) { - ULong *cost_Dw; - - if (TG_(current_state).nonskipped) - cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); - else - cost_Dw = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; - - inc_costs(DwRes, cost_Dw, - TG_(current_state).cost + fullOffset(EG_DW) ); - } -} + CacheModelResult DwRes; + + current_ii = ii; + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n", data_addr, data_size, + cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong* cost_Dw; + if (TG_(current_state).nonskipped) + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + else + cost_Dw = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + inc_costs(DwRes, cost_Dw, TG_(current_state).cost + fullOffset(EG_DW)); + } +} /*------------------------------------------------------------*/ /*--- Cache configuration ---*/ @@ -1287,163 +1299,154 @@ static cache_t clo_LL_cache = UNDEFINED_CACHE; /* Initialize and clear simulator state */ static void cachesim_post_clo_init(void) { - /* Cache configurations. */ - cache_t I1c, D1c, LLc; - - /* Initialize access handlers */ - if (!TG_(clo).simulate_cache) { - TG_(cachesim).log_1I0D = 0; - TG_(cachesim).log_1I0D_name = "(no function)"; - TG_(cachesim).log_2I0D = 0; - TG_(cachesim).log_2I0D_name = "(no function)"; - TG_(cachesim).log_3I0D = 0; - TG_(cachesim).log_3I0D_name = "(no function)"; - - TG_(cachesim).log_1I1Dr = 0; - TG_(cachesim).log_1I1Dr_name = "(no function)"; - TG_(cachesim).log_1I1Dw = 0; - TG_(cachesim).log_1I1Dw_name = "(no function)"; - - TG_(cachesim).log_0I1Dr = 0; - TG_(cachesim).log_0I1Dr_name = "(no function)"; - TG_(cachesim).log_0I1Dw = 0; - TG_(cachesim).log_0I1Dw_name = "(no function)"; - return; - } - - /* Configuration of caches only needed with real cache simulation */ - VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc, - &clo_I1_cache, - &clo_D1_cache, - &clo_LL_cache); - - I1.name = "I1"; - D1.name = "D1"; - LL.name = "LL"; - - // min_line_size is used to make sure that we never feed - // accesses to the simulator straddling more than two - // cache lines at any cache level - TG_(min_line_size) = (I1c.line_size < D1c.line_size) - ? I1c.line_size : D1c.line_size; - TG_(min_line_size) = (LLc.line_size < TG_(min_line_size)) - ? LLc.line_size : TG_(min_line_size); - - Int largest_load_or_store_size - = VG_(machine_get_size_of_largest_guest_register)(); - if (TG_(min_line_size) < largest_load_or_store_size) { - /* We can't continue, because the cache simulation might - straddle more than 2 lines, and it will assert. So let's - just stop before we start. 
*/ - VG_(umsg)("Tracegrind: cannot continue: the minimum line size (%d)\n", - (Int)TG_(min_line_size)); - VG_(umsg)(" must be equal to or larger than the maximum register size (%d)\n", - largest_load_or_store_size ); - VG_(umsg)(" but it is not. Exiting now.\n"); - VG_(exit)(1); - } - - cachesim_initcache(I1c, &I1); - cachesim_initcache(D1c, &D1); - cachesim_initcache(LLc, &LL); - - /* the other cache simulators use the standard helpers - * with dispatching via simulator struct */ - - TG_(cachesim).log_1I0D = log_1I0D; - TG_(cachesim).log_1I0D_name = "log_1I0D"; - TG_(cachesim).log_2I0D = log_2I0D; - TG_(cachesim).log_2I0D_name = "log_2I0D"; - TG_(cachesim).log_3I0D = log_3I0D; - TG_(cachesim).log_3I0D_name = "log_3I0D"; - - TG_(cachesim).log_1I1Dr = log_1I1Dr; - TG_(cachesim).log_1I1Dw = log_1I1Dw; - TG_(cachesim).log_1I1Dr_name = "log_1I1Dr"; - TG_(cachesim).log_1I1Dw_name = "log_1I1Dw"; - - TG_(cachesim).log_0I1Dr = log_0I1Dr; - TG_(cachesim).log_0I1Dw = log_0I1Dw; - TG_(cachesim).log_0I1Dr_name = "log_0I1Dr"; - TG_(cachesim).log_0I1Dw_name = "log_0I1Dw"; - - if (clo_collect_cacheuse) { + /* Cache configurations. */ + cache_t I1c, D1c, LLc; + + /* Initialize access handlers */ + if (!TG_(clo).simulate_cache) { + TG_(cachesim).log_1I0D = 0; + TG_(cachesim).log_1I0D_name = "(no function)"; + TG_(cachesim).log_2I0D = 0; + TG_(cachesim).log_2I0D_name = "(no function)"; + TG_(cachesim).log_3I0D = 0; + TG_(cachesim).log_3I0D_name = "(no function)"; + + TG_(cachesim).log_1I1Dr = 0; + TG_(cachesim).log_1I1Dr_name = "(no function)"; + TG_(cachesim).log_1I1Dw = 0; + TG_(cachesim).log_1I1Dw_name = "(no function)"; + + TG_(cachesim).log_0I1Dr = 0; + TG_(cachesim).log_0I1Dr_name = "(no function)"; + TG_(cachesim).log_0I1Dw = 0; + TG_(cachesim).log_0I1Dw_name = "(no function)"; + return; + } + + /* Configuration of caches only needed with real cache simulation */ + VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc, &clo_I1_cache, + &clo_D1_cache, &clo_LL_cache); + + I1.name = "I1"; + D1.name = "D1"; + LL.name = "LL"; + + // min_line_size is used to make sure that we never feed + // accesses to the simulator straddling more than two + // cache lines at any cache level + TG_(min_line_size) = + (I1c.line_size < D1c.line_size) ? I1c.line_size : D1c.line_size; + TG_(min_line_size) = + (LLc.line_size < TG_(min_line_size)) ? LLc.line_size : TG_(min_line_size); + + Int largest_load_or_store_size = + VG_(machine_get_size_of_largest_guest_register)(); + if (TG_(min_line_size) < largest_load_or_store_size) { + /* We can't continue, because the cache simulation might + straddle more than 2 lines, and it will assert. So let's + just stop before we start. */ + VG_(umsg)("Tracegrind: cannot continue: the minimum line size (%d)\n", + (Int)TG_(min_line_size)); + VG_(umsg)( + " must be equal to or larger than the maximum register size (%d)\n", + largest_load_or_store_size); + VG_(umsg)(" but it is not. 
Exiting now.\n"); + VG_(exit)(1); + } + + cachesim_initcache(I1c, &I1); + cachesim_initcache(D1c, &D1); + cachesim_initcache(LLc, &LL); + + /* the other cache simulators use the standard helpers + * with dispatching via simulator struct */ + + TG_(cachesim).log_1I0D = log_1I0D; + TG_(cachesim).log_1I0D_name = "log_1I0D"; + TG_(cachesim).log_2I0D = log_2I0D; + TG_(cachesim).log_2I0D_name = "log_2I0D"; + TG_(cachesim).log_3I0D = log_3I0D; + TG_(cachesim).log_3I0D_name = "log_3I0D"; + + TG_(cachesim).log_1I1Dr = log_1I1Dr; + TG_(cachesim).log_1I1Dw = log_1I1Dw; + TG_(cachesim).log_1I1Dr_name = "log_1I1Dr"; + TG_(cachesim).log_1I1Dw_name = "log_1I1Dw"; + + TG_(cachesim).log_0I1Dr = log_0I1Dr; + TG_(cachesim).log_0I1Dw = log_0I1Dw; + TG_(cachesim).log_0I1Dr_name = "log_0I1Dr"; + TG_(cachesim).log_0I1Dw_name = "log_0I1Dw"; + + if (clo_collect_cacheuse) { /* Output warning for not supported option combinations */ if (clo_simulate_hwpref) { - VG_(message)(Vg_DebugMsg, - "warning: prefetch simulation can not be " - "used with cache usage\n"); - clo_simulate_hwpref = False; + VG_(message)(Vg_DebugMsg, "warning: prefetch simulation can not be " + "used with cache usage\n"); + clo_simulate_hwpref = False; } if (clo_simulate_writeback) { - VG_(message)(Vg_DebugMsg, - "warning: write-back simulation can not be " - "used with cache usage\n"); - clo_simulate_writeback = False; + VG_(message)(Vg_DebugMsg, "warning: write-back simulation can not be " + "used with cache usage\n"); + clo_simulate_writeback = False; } simulator.I1_Read = cacheuse_I1_doRead; simulator.D1_Read = cacheuse_D1_doRead; simulator.D1_Write = cacheuse_D1_doRead; return; - } - - if (clo_simulate_hwpref) { - prefetch_clear(); - - if (clo_simulate_writeback) { - simulator.I1_Read = prefetch_I1_Read; - simulator.D1_Read = prefetch_D1_Read; - simulator.D1_Write = prefetch_D1_Write; - } - else { - simulator.I1_Read = prefetch_I1_ref; - simulator.D1_Read = prefetch_D1_ref; - simulator.D1_Write = prefetch_D1_ref; - } - - return; - } - - if (clo_simulate_writeback) { + } + + if (clo_simulate_hwpref) { + prefetch_clear(); + + if (clo_simulate_writeback) { + simulator.I1_Read = prefetch_I1_Read; + simulator.D1_Read = prefetch_D1_Read; + simulator.D1_Write = prefetch_D1_Write; + } else { + simulator.I1_Read = prefetch_I1_ref; + simulator.D1_Read = prefetch_D1_ref; + simulator.D1_Write = prefetch_D1_ref; + } + + return; + } + + if (clo_simulate_writeback) { simulator.I1_Read = cachesim_I1_Read; simulator.D1_Read = cachesim_D1_Read; simulator.D1_Write = cachesim_D1_Write; - } - else { + } else { simulator.I1_Read = cachesim_I1_ref; simulator.D1_Read = cachesim_D1_ref; simulator.D1_Write = cachesim_D1_ref; - } + } } - /* Clear simulator state. 
Has to be initialized before */ -static -void cachesim_clear(void) +static void cachesim_clear(void) { - cachesim_clearcache(&I1); - cachesim_clearcache(&D1); - cachesim_clearcache(&LL); + cachesim_clearcache(&I1); + cachesim_clearcache(&D1); + cachesim_clearcache(&LL); - prefetch_clear(); + prefetch_clear(); } - -static -void cachesim_print_opts(void) +static void cachesim_print_opts(void) { - VG_(printf)( -"\n cache simulator options (does cache simulation if used):\n" -" --simulate-wb=no|yes Count write-back events [no]\n" -" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n" + VG_(printf)( + "\n cache simulator options (does cache simulation if used):\n" + " --simulate-wb=no|yes Count write-back events [no]\n" + " --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n" #if TG_EXPERIMENTAL -" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n" + " --simulate-sectors=no|yes Simulate sectored behaviour [no]\n" #endif -" --cacheuse=no|yes Collect cache block use [no]\n"); - VG_(print_cache_clo_opts)(); + " --cacheuse=no|yes Collect cache block use [no]\n"); + VG_(print_cache_clo_opts)(); } /* Check for command line option for cache configuration. @@ -1453,128 +1456,120 @@ void cachesim_print_opts(void) */ static Bool cachesim_parse_opt(const HChar* arg) { - if VG_BOOL_CLO(arg, "--simulate-wb", clo_simulate_writeback) {} - else if VG_BOOL_CLO(arg, "--simulate-hwpref", clo_simulate_hwpref) {} - else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors) {} + if VG_BOOL_CLO (arg, "--simulate-wb", clo_simulate_writeback) { + } else if VG_BOOL_CLO (arg, "--simulate-hwpref", clo_simulate_hwpref) { + } else if VG_BOOL_CLO (arg, "--simulate-sectors", clo_simulate_sectors) { + } - else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) { + else if VG_BOOL_CLO (arg, "--cacheuse", clo_collect_cacheuse) { } - else if (VG_(str_clo_cache_opt)(arg, - &clo_I1_cache, - &clo_D1_cache, - &clo_LL_cache)) {} + else if (VG_(str_clo_cache_opt)(arg, &clo_I1_cache, &clo_D1_cache, + &clo_LL_cache)) { + } else - return False; + return False; - return True; + return True; } -static -void cachesim_printstat(Int l1, Int l2, Int l3) +static void cachesim_printstat(Int l1, Int l2, Int l3) { - FullCost total = TG_(total_cost), D_total = 0; - ULong LL_total_m, LL_total_mr, LL_total_mw, - LL_total, LL_total_r, LL_total_w; - - if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) { - VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n", - prefetch_up); - VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n", - prefetch_down); - VG_(message)(Vg_DebugMsg, "\n"); - } - - VG_(message)(Vg_UserMsg, "I1 misses: %'*llu\n", l1, - total[fullOffset(EG_IR) +1]); - - VG_(message)(Vg_UserMsg, "LLi misses: %'*llu\n", l1, - total[fullOffset(EG_IR) +2]); - - if (0 == total[fullOffset(EG_IR)]) - total[fullOffset(EG_IR)] = 1; - - VG_(message)(Vg_UserMsg, "I1 miss rate: %*.2f%%\n", l1, - total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]); - - VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1, - total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]); - - VG_(message)(Vg_UserMsg, "\n"); - - /* D cache results. - Use the D_refs.rd and D_refs.wr values to determine the - * width of columns 2 & 3. 
*/ - - D_total = TG_(get_eventset_cost)( TG_(sets).full ); - TG_(init_cost)( TG_(sets).full, D_total); - // we only use the first 3 values of D_total, adding up Dr and Dw costs - TG_(copy_cost)( TG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) ); - TG_(add_cost) ( TG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) ); - - VG_(message)(Vg_UserMsg, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n", - l1, D_total[0], - l2, total[fullOffset(EG_DR)], - l3, total[fullOffset(EG_DW)]); - - VG_(message)(Vg_UserMsg, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n", - l1, D_total[1], - l2, total[fullOffset(EG_DR)+1], - l3, total[fullOffset(EG_DW)+1]); - - VG_(message)(Vg_UserMsg, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n", - l1, D_total[2], - l2, total[fullOffset(EG_DR)+2], - l3, total[fullOffset(EG_DW)+2]); - - if (0 == D_total[0]) D_total[0] = 1; - if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1; - if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1; - - VG_(message)(Vg_UserMsg, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", - l1, D_total[1] * 100.0 / D_total[0], - l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)], - l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]); - - VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", - l1, D_total[2] * 100.0 / D_total[0], - l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)], - l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]); - VG_(message)(Vg_UserMsg, "\n"); - - - - /* LL overall results */ - - LL_total = - total[fullOffset(EG_DR) +1] + - total[fullOffset(EG_DW) +1] + - total[fullOffset(EG_IR) +1]; - LL_total_r = - total[fullOffset(EG_DR) +1] + - total[fullOffset(EG_IR) +1]; - LL_total_w = total[fullOffset(EG_DW) +1]; - VG_(message)(Vg_UserMsg, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n", - l1, LL_total, l2, LL_total_r, l3, LL_total_w); - - LL_total_m = - total[fullOffset(EG_DR) +2] + - total[fullOffset(EG_DW) +2] + - total[fullOffset(EG_IR) +2]; - LL_total_mr = - total[fullOffset(EG_DR) +2] + - total[fullOffset(EG_IR) +2]; - LL_total_mw = total[fullOffset(EG_DW) +2]; - VG_(message)(Vg_UserMsg, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n", - l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw); - - VG_(message)(Vg_UserMsg, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", - l1, LL_total_m * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]), - l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]), - l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]); -} + FullCost total = TG_(total_cost), D_total = 0; + ULong LL_total_m, LL_total_mr, LL_total_mw, LL_total, LL_total_r, LL_total_w; + if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n", prefetch_up); + VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n", prefetch_down); + VG_(message)(Vg_DebugMsg, "\n"); + } + + VG_(message)(Vg_UserMsg, "I1 misses: %'*llu\n", l1, + total[fullOffset(EG_IR) + 1]); + + VG_(message)(Vg_UserMsg, "LLi misses: %'*llu\n", l1, + total[fullOffset(EG_IR) + 2]); + + if (0 == total[fullOffset(EG_IR)]) + total[fullOffset(EG_IR)] = 1; + + VG_(message)(Vg_UserMsg, "I1 miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR) + 1] * 100.0 / + total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR) + 2] * 100.0 / + total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "\n"); + + /* D cache results. 
+ Use the D_refs.rd and D_refs.wr values to determine the + * width of columns 2 & 3. */ + + D_total = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, D_total); + // we only use the first 3 values of D_total, adding up Dr and Dw costs + TG_(copy_cost) + (TG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR)); + TG_(add_cost)(TG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW)); + + VG_(message)(Vg_UserMsg, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[0], l2, total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW)]); + + VG_(message)(Vg_UserMsg, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[1], l2, total[fullOffset(EG_DR) + 1], l3, + total[fullOffset(EG_DW) + 1]); + + VG_(message)(Vg_UserMsg, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[2], l2, total[fullOffset(EG_DR) + 2], l3, + total[fullOffset(EG_DW) + 2]); + + if (0 == D_total[0]) + D_total[0] = 1; + if (0 == total[fullOffset(EG_DR)]) + total[fullOffset(EG_DR)] = 1; + if (0 == total[fullOffset(EG_DW)]) + total[fullOffset(EG_DW)] = 1; + + VG_(message)( + Vg_UserMsg, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + D_total[1] * 100.0 / D_total[0], l2, + total[fullOffset(EG_DR) + 1] * 100.0 / total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW) + 1] * 100.0 / total[fullOffset(EG_DW)]); + + VG_(message)( + Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + D_total[2] * 100.0 / D_total[0], l2, + total[fullOffset(EG_DR) + 2] * 100.0 / total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW) + 2] * 100.0 / total[fullOffset(EG_DW)]); + VG_(message)(Vg_UserMsg, "\n"); + + /* LL overall results */ + + LL_total = total[fullOffset(EG_DR) + 1] + total[fullOffset(EG_DW) + 1] + + total[fullOffset(EG_IR) + 1]; + LL_total_r = total[fullOffset(EG_DR) + 1] + total[fullOffset(EG_IR) + 1]; + LL_total_w = total[fullOffset(EG_DW) + 1]; + VG_(message)(Vg_UserMsg, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total, l2, LL_total_r, l3, LL_total_w); + + LL_total_m = total[fullOffset(EG_DR) + 2] + total[fullOffset(EG_DW) + 2] + + total[fullOffset(EG_IR) + 2]; + LL_total_mr = total[fullOffset(EG_DR) + 2] + total[fullOffset(EG_IR) + 2]; + LL_total_mw = total[fullOffset(EG_DW) + 2]; + VG_(message)(Vg_UserMsg, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw); + + VG_(message)( + Vg_UserMsg, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + LL_total_m * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]), l2, + LL_total_mr * 100.0 / + (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]), + l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]); +} /*------------------------------------------------------------*/ /*--- Setup for Event set. 
---*/ @@ -1584,91 +1579,89 @@ struct event_sets TG_(sets); void TG_(init_eventsets)(void) { - // Event groups from which the event sets are composed - // the "Use" group only is used with "cacheuse" simulation - if (clo_collect_cacheuse) - TG_(register_event_group4)(EG_USE, - "AcCost1", "SpLoss1", "AcCost2", "SpLoss2"); - - if (!TG_(clo).simulate_cache) - TG_(register_event_group)(EG_IR, "Ir"); - else if (!clo_simulate_writeback) { - TG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr"); - TG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr"); - TG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw"); - } - else { // clo_simulate_writeback - TG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr"); - TG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr"); - TG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw"); - } - - if (TG_(clo).simulate_branch) { - TG_(register_event_group2)(EG_BC, "Bc", "Bcm"); - TG_(register_event_group2)(EG_BI, "Bi", "Bim"); - } - - if (TG_(clo).collect_bus) - TG_(register_event_group)(EG_BUS, "Ge"); - - if (TG_(clo).collect_systime != systime_no) { - if (TG_(clo).collect_systime == systime_nsec) - TG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime"); - else - TG_(register_event_group2)(EG_SYS, "sysCount", "sysTime"); - } - - // event set used as base for instruction self cost - TG_(sets).base = TG_(get_event_set2)(EG_USE, EG_IR); - - // event set comprising all event groups, used for inclusive cost - TG_(sets).full = TG_(add_event_group2)(TG_(sets).base, EG_DR, EG_DW); - TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_BC, EG_BI); - TG_(sets).full = TG_(add_event_group) (TG_(sets).full, EG_BUS); - TG_(sets).full = TG_(add_event_group) (TG_(sets).full, EG_SYS); - - TG_DEBUGIF(1) { - TG_DEBUG(1, "EventSets:\n"); - TG_(print_eventset)(-2, TG_(sets).base); - TG_(print_eventset)(-2, TG_(sets).full); - } - - /* Not-existing events are silently ignored */ - TG_(dumpmap) = TG_(get_eventmapping)(TG_(sets).full); - TG_(append_event)(TG_(dumpmap), "Ir"); - TG_(append_event)(TG_(dumpmap), "Dr"); - TG_(append_event)(TG_(dumpmap), "Dw"); - TG_(append_event)(TG_(dumpmap), "I1mr"); - TG_(append_event)(TG_(dumpmap), "D1mr"); - TG_(append_event)(TG_(dumpmap), "D1mw"); - TG_(append_event)(TG_(dumpmap), "ILmr"); - TG_(append_event)(TG_(dumpmap), "DLmr"); - TG_(append_event)(TG_(dumpmap), "DLmw"); - TG_(append_event)(TG_(dumpmap), "ILdmr"); - TG_(append_event)(TG_(dumpmap), "DLdmr"); - TG_(append_event)(TG_(dumpmap), "DLdmw"); - TG_(append_event)(TG_(dumpmap), "Bc"); - TG_(append_event)(TG_(dumpmap), "Bcm"); - TG_(append_event)(TG_(dumpmap), "Bi"); - TG_(append_event)(TG_(dumpmap), "Bim"); - TG_(append_event)(TG_(dumpmap), "AcCost1"); - TG_(append_event)(TG_(dumpmap), "SpLoss1"); - TG_(append_event)(TG_(dumpmap), "AcCost2"); - TG_(append_event)(TG_(dumpmap), "SpLoss2"); - TG_(append_event)(TG_(dumpmap), "Ge"); - TG_(append_event)(TG_(dumpmap), "allocCount"); - TG_(append_event)(TG_(dumpmap), "allocSize"); - TG_(append_event)(TG_(dumpmap), "sysCount"); - TG_(append_event)(TG_(dumpmap), "sysTime"); - TG_(append_event)(TG_(dumpmap), "sysCpuTime"); -} + // Event groups from which the event sets are composed + // the "Use" group only is used with "cacheuse" simulation + if (clo_collect_cacheuse) + TG_(register_event_group4) + (EG_USE, "AcCost1", "SpLoss1", "AcCost2", "SpLoss2"); + + if (!TG_(clo).simulate_cache) + TG_(register_event_group)(EG_IR, "Ir"); + else if (!clo_simulate_writeback) { + TG_(register_event_group3)(EG_IR, 
"Ir", "I1mr", "ILmr"); + TG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr"); + TG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw"); + } else { // clo_simulate_writeback + TG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr"); + TG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr"); + TG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw"); + } + + if (TG_(clo).simulate_branch) { + TG_(register_event_group2)(EG_BC, "Bc", "Bcm"); + TG_(register_event_group2)(EG_BI, "Bi", "Bim"); + } + + if (TG_(clo).collect_bus) + TG_(register_event_group)(EG_BUS, "Ge"); + if (TG_(clo).collect_systime != systime_no) { + if (TG_(clo).collect_systime == systime_nsec) + TG_(register_event_group3) + (EG_SYS, "sysCount", "sysTime", "sysCpuTime"); + else TG_(register_event_group2)(EG_SYS, "sysCount", "sysTime"); + } + + // event set used as base for instruction self cost + TG_(sets).base = TG_(get_event_set2)(EG_USE, EG_IR); + + // event set comprising all event groups, used for inclusive cost + TG_(sets).full = TG_(add_event_group2)(TG_(sets).base, EG_DR, EG_DW); + TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_BC, EG_BI); + TG_(sets).full = TG_(add_event_group)(TG_(sets).full, EG_BUS); + TG_(sets).full = TG_(add_event_group)(TG_(sets).full, EG_SYS); + + TG_DEBUGIF(1) + { + TG_DEBUG(1, "EventSets:\n"); + TG_(print_eventset)(-2, TG_(sets).base); + TG_(print_eventset)(-2, TG_(sets).full); + } -static -void cachesim_finish(void) + /* Not-existing events are silently ignored */ + TG_(dumpmap) = TG_(get_eventmapping)(TG_(sets).full); + TG_(append_event)(TG_(dumpmap), "Ir"); + TG_(append_event)(TG_(dumpmap), "Dr"); + TG_(append_event)(TG_(dumpmap), "Dw"); + TG_(append_event)(TG_(dumpmap), "I1mr"); + TG_(append_event)(TG_(dumpmap), "D1mr"); + TG_(append_event)(TG_(dumpmap), "D1mw"); + TG_(append_event)(TG_(dumpmap), "ILmr"); + TG_(append_event)(TG_(dumpmap), "DLmr"); + TG_(append_event)(TG_(dumpmap), "DLmw"); + TG_(append_event)(TG_(dumpmap), "ILdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmw"); + TG_(append_event)(TG_(dumpmap), "Bc"); + TG_(append_event)(TG_(dumpmap), "Bcm"); + TG_(append_event)(TG_(dumpmap), "Bi"); + TG_(append_event)(TG_(dumpmap), "Bim"); + TG_(append_event)(TG_(dumpmap), "AcCost1"); + TG_(append_event)(TG_(dumpmap), "SpLoss1"); + TG_(append_event)(TG_(dumpmap), "AcCost2"); + TG_(append_event)(TG_(dumpmap), "SpLoss2"); + TG_(append_event)(TG_(dumpmap), "Ge"); + TG_(append_event)(TG_(dumpmap), "allocCount"); + TG_(append_event)(TG_(dumpmap), "allocSize"); + TG_(append_event)(TG_(dumpmap), "sysCount"); + TG_(append_event)(TG_(dumpmap), "sysTime"); + TG_(append_event)(TG_(dumpmap), "sysCpuTime"); +} + +static void cachesim_finish(void) { - if (clo_collect_cacheuse) - cacheuse_finish(); + if (clo_collect_cacheuse) + cacheuse_finish(); } /*------------------------------------------------------------*/ @@ -1676,36 +1669,35 @@ void cachesim_finish(void) /*------------------------------------------------------------*/ struct cachesim_if TG_(cachesim) = { - .print_opts = cachesim_print_opts, - .parse_opt = cachesim_parse_opt, - .post_clo_init = cachesim_post_clo_init, - .clear = cachesim_clear, - .printstat = cachesim_printstat, - .finish = cachesim_finish, + .print_opts = cachesim_print_opts, + .parse_opt = cachesim_parse_opt, + .post_clo_init = cachesim_post_clo_init, + .clear = cachesim_clear, + .printstat = cachesim_printstat, + .finish = cachesim_finish, - /* these will be set by cachesim_post_clo_init */ - 
.log_1I0D = 0, - .log_2I0D = 0, - .log_3I0D = 0, + /* these will be set by cachesim_post_clo_init */ + .log_1I0D = 0, + .log_2I0D = 0, + .log_3I0D = 0, - .log_1I1Dr = 0, - .log_1I1Dw = 0, + .log_1I1Dr = 0, + .log_1I1Dw = 0, - .log_0I1Dr = 0, - .log_0I1Dw = 0, + .log_0I1Dr = 0, + .log_0I1Dw = 0, - .log_1I0D_name = "(no function)", - .log_2I0D_name = "(no function)", - .log_3I0D_name = "(no function)", + .log_1I0D_name = "(no function)", + .log_2I0D_name = "(no function)", + .log_3I0D_name = "(no function)", - .log_1I1Dr_name = "(no function)", - .log_1I1Dw_name = "(no function)", + .log_1I1Dr_name = "(no function)", + .log_1I1Dw_name = "(no function)", - .log_0I1Dr_name = "(no function)", - .log_0I1Dw_name = "(no function)", + .log_0I1Dr_name = "(no function)", + .log_0I1Dw_name = "(no function)", }; - /*--------------------------------------------------------------------*/ /*--- end ct_sim.c ---*/ /*--------------------------------------------------------------------*/ diff --git a/tracegrind/tests/test_basic.c b/tracegrind/tests/test_basic.c index 971ad25c7..2dddef620 100644 --- a/tracegrind/tests/test_basic.c +++ b/tracegrind/tests/test_basic.c @@ -1,11 +1,14 @@ #include "tracegrind.h" -static int factorial(int n) { - if (n <= 1) return 1; - return n * factorial(n - 1); +static int factorial(int n) +{ + if (n <= 1) + return 1; + return n * factorial(n - 1); } -int main(void) { - int result = factorial(5); - return result != 120; +int main(void) +{ + int result = factorial(5); + return result != 120; } diff --git a/tracegrind/tests/test_enter_inlined.c b/tracegrind/tests/test_enter_inlined.c index 7ab5593eb..70aa99e84 100644 --- a/tracegrind/tests/test_enter_inlined.c +++ b/tracegrind/tests/test_enter_inlined.c @@ -2,31 +2,34 @@ /* Force inlining - with --read-inline-info=yes these should produce * ENTER_INLINED / EXIT_INLINED events in the trace */ -static inline __attribute__((always_inline)) int inlined_work(int a, int b) { - /* Make the function large enough to span multiple basic blocks - * so at least one BB boundary falls inside inlined code */ - int result = 0; - if (a > 0) { - result = a * b; - } else { - result = a + b; - } - return result; +static inline __attribute__((always_inline)) int inlined_work(int a, int b) +{ + /* Make the function large enough to span multiple basic blocks + * so at least one BB boundary falls inside inlined code */ + int result = 0; + if (a > 0) { + result = a * b; + } else { + result = a + b; + } + return result; } /* Prevent inlining - SHOULD appear as ENTER/EXIT */ -static int __attribute__((noinline)) not_inlined_caller(int n) { - /* Use volatile to prevent constant propagation */ - volatile int x = n; - return inlined_work(x, x + 1); +static int __attribute__((noinline)) not_inlined_caller(int n) +{ + /* Use volatile to prevent constant propagation */ + volatile int x = n; + return inlined_work(x, x + 1); } -int main(void) { - volatile int input = 3; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = not_inlined_caller(input); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); - return result != 12; +int main(void) +{ + volatile int input = 3; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 12; } diff --git a/tracegrind/tests/test_exception.cpp b/tracegrind/tests/test_exception.cpp index 8b791728a..b9b599bdd 100644 --- 
a/tracegrind/tests/test_exception.cpp +++ b/tracegrind/tests/test_exception.cpp @@ -12,31 +12,35 @@ * Call chain: catcher -> thrower -> do_throw (throws) */ -static void __attribute__((noinline)) do_throw(int x) { - if (x > 0) - throw std::runtime_error("boom"); +static void __attribute__((noinline)) do_throw(int x) +{ + if (x > 0) + throw std::runtime_error("boom"); } -static int __attribute__((noinline)) thrower(int n) { - volatile int x = n; - do_throw(x); - return x; +static int __attribute__((noinline)) thrower(int n) +{ + volatile int x = n; + do_throw(x); + return x; } -static int __attribute__((noinline)) catcher(int n) { - try { - return thrower(n); - } catch (const std::exception&) { - return -1; - } +static int __attribute__((noinline)) catcher(int n) +{ + try { + return thrower(n); + } catch (const std::exception&) { + return -1; + } } -int main() { - volatile int input = 5; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = catcher(input); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); - return result != -1; +int main() +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = catcher(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != -1; } diff --git a/tracegrind/tests/test_foo_bar_baz.c b/tracegrind/tests/test_foo_bar_baz.c index e9f7a6783..f4f2560f4 100644 --- a/tracegrind/tests/test_foo_bar_baz.c +++ b/tracegrind/tests/test_foo_bar_baz.c @@ -1,23 +1,18 @@ #include "tracegrind.h" -static int __attribute__((noinline)) baz(int n) { - return n * 2; -} +static int __attribute__((noinline)) baz(int n) { return n * 2; } -static int __attribute__((noinline)) bar(int n) { - return baz(n) + 1; -} +static int __attribute__((noinline)) bar(int n) { return baz(n) + 1; } -static int __attribute__((noinline)) foo(int n) { - return bar(n) + bar(n + 1); -} +static int __attribute__((noinline)) foo(int n) { return bar(n) + bar(n + 1); } -int main(void) { - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = foo(3); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); +int main(void) +{ + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = foo(3); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); - return result != (baz(3) + 1 + baz(4) + 1); + return result != (baz(3) + 1 + baz(4) + 1); } diff --git a/tracegrind/tests/test_inline.c b/tracegrind/tests/test_inline.c index fb295f73f..0533ee592 100644 --- a/tracegrind/tests/test_inline.c +++ b/tracegrind/tests/test_inline.c @@ -1,25 +1,29 @@ #include "tracegrind.h" /* Force inlining - these should NOT appear as ENTER/EXIT in the trace */ -static inline __attribute__((always_inline)) int inlined_add(int a, int b) { - return a + b; +static inline __attribute__((always_inline)) int inlined_add(int a, int b) +{ + return a + b; } -static inline __attribute__((always_inline)) int inlined_mul(int a, int b) { - return a * b; +static inline __attribute__((always_inline)) int inlined_mul(int a, int b) +{ + return a * b; } /* Prevent inlining - these SHOULD appear as ENTER/EXIT in the trace */ -static int __attribute__((noinline)) not_inlined_work(int n) { - return inlined_add(n, inlined_mul(n, 2)); +static int __attribute__((noinline)) not_inlined_work(int n) +{ + return inlined_add(n, inlined_mul(n, 2)); } -int main(void) { - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = 
not_inlined_work(5); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); +int main(void) +{ + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_work(5); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); - return result != 15; + return result != 15; } diff --git a/tracegrind/tests/test_instr_toggle.c b/tracegrind/tests/test_instr_toggle.c index e47767d2c..07d5f46f8 100644 --- a/tracegrind/tests/test_instr_toggle.c +++ b/tracegrind/tests/test_instr_toggle.c @@ -1,18 +1,21 @@ #include "tracegrind.h" -static int __attribute__((noinline)) fibo(int n) { - if (n <= 1) return n; - return fibo(n - 1) + fibo(n - 2); +static int __attribute__((noinline)) fibo(int n) +{ + if (n <= 1) + return n; + return fibo(n - 1) + fibo(n - 2); } -int main(void) { - /* Instrumentation is off (--instr-atstart=no). - Only the fibo(2) call will be traced. */ - TRACEGRIND_ADD_MARKER("before-fibo"); - TRACEGRIND_START_INSTRUMENTATION; - int result = fibo(2); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("after-fibo"); +int main(void) +{ + /* Instrumentation is off (--instr-atstart=no). + Only the fibo(2) call will be traced. */ + TRACEGRIND_ADD_MARKER("before-fibo"); + TRACEGRIND_START_INSTRUMENTATION; + int result = fibo(2); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("after-fibo"); - return result != 1; + return result != 1; } diff --git a/tracegrind/tests/test_longjmp.c b/tracegrind/tests/test_longjmp.c index ffca450f8..5659431b2 100644 --- a/tracegrind/tests/test_longjmp.c +++ b/tracegrind/tests/test_longjmp.c @@ -13,35 +13,39 @@ static jmp_buf env; -static void __attribute__((noinline)) inner(int n) { - volatile int x = n * 2; - (void)x; - longjmp(env, 42); +static void __attribute__((noinline)) inner(int n) +{ + volatile int x = n * 2; + (void)x; + longjmp(env, 42); } -static void __attribute__((noinline)) middle(int n) { - volatile int x = n + 1; - inner(x); - /* never reached */ - x = x + 1; +static void __attribute__((noinline)) middle(int n) +{ + volatile int x = n + 1; + inner(x); + /* never reached */ + x = x + 1; } -static int __attribute__((noinline)) outer(int n) { - int val = setjmp(env); - if (val == 0) { - middle(n); - /* never reached */ - return -1; - } - return val; +static int __attribute__((noinline)) outer(int n) +{ + int val = setjmp(env); + if (val == 0) { + middle(n); + /* never reached */ + return -1; + } + return val; } -int main(void) { - volatile int input = 5; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = outer(input); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); - return result != 42; +int main(void) +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = outer(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 42; } diff --git a/tracegrind/tests/test_marker.c b/tracegrind/tests/test_marker.c index 76a5d72cc..721883b3b 100644 --- a/tracegrind/tests/test_marker.c +++ b/tracegrind/tests/test_marker.c @@ -1,15 +1,17 @@ #include "tracegrind.h" -static int compute(int n) { - int sum = 0; - for (int i = 0; i < n; i++) - sum += i * i; - return sum; +static int compute(int n) +{ + int sum = 0; + for (int i = 0; i < n; i++) + sum += i * i; + return sum; } -int main(void) { - TRACEGRIND_ADD_MARKER("start-work"); - int result = compute(1000); - TRACEGRIND_ADD_MARKER("end-work"); - return result == 0; +int main(void) +{ + 
TRACEGRIND_ADD_MARKER("start-work"); + int result = compute(1000); + TRACEGRIND_ADD_MARKER("end-work"); + return result == 0; } diff --git a/tracegrind/tests/test_nested_inlined.c b/tracegrind/tests/test_nested_inlined.c index 2c1ca6c33..a0daca1e1 100644 --- a/tracegrind/tests/test_nested_inlined.c +++ b/tracegrind/tests/test_nested_inlined.c @@ -3,14 +3,15 @@ /* Inner inlined function. * With --read-inline-info=yes, should produce ENTER_INLINED / EXIT_INLINED * events with fn=inner_inline. */ -static inline __attribute__((always_inline)) int inner_inline(int a) { - int result; - if (a > 0) { - result = a * 3; - } else { - result = a + 1; - } - return result; +static inline __attribute__((always_inline)) int inner_inline(int a) +{ + int result; + if (a > 0) { + result = a * 3; + } else { + result = a + 1; + } + return result; } /* Outer inlined function - calls inner_inline. @@ -18,30 +19,33 @@ static inline __attribute__((always_inline)) int inner_inline(int a) { * showing nested inline transitions. * Uses volatile stores in both branches to prevent the compiler from * converting the if-else to a branchless cmov. */ -static inline __attribute__((always_inline)) int outer_inline(int a, int b) { - volatile int x; - if (a > b) { - x = a - b; - } else { - x = b - a; - } - int y = inner_inline(x); - return y + a; +static inline __attribute__((always_inline)) int outer_inline(int a, int b) +{ + volatile int x; + if (a > b) { + x = a - b; + } else { + x = b - a; + } + int y = inner_inline(x); + return y + a; } /* Non-inlined caller */ -static int __attribute__((noinline)) caller(int n) { - volatile int x = n; - return outer_inline(x, x + 1); +static int __attribute__((noinline)) caller(int n) +{ + volatile int x = n; + return outer_inline(x, x + 1); } -int main(void) { - volatile int input = 5; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = caller(input); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); - /* caller(5) -> outer_inline(5, 6): x=1, inner_inline(1)=3, 3+5=8 */ - return result != 8; +int main(void) +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + /* caller(5) -> outer_inline(5, 6): x=1, inner_inline(1)=3, 3+5=8 */ + return result != 8; } diff --git a/tracegrind/tests/test_recursion.c b/tracegrind/tests/test_recursion.c index 0d96b0cca..e3589ae6c 100644 --- a/tracegrind/tests/test_recursion.c +++ b/tracegrind/tests/test_recursion.c @@ -8,19 +8,21 @@ * produces balanced ENTER/EXIT pairs. 
 */
-static int __attribute__((noinline)) recurse(int depth) {
- volatile int d = depth;
- if (d <= 0)
- return 0;
- return recurse(d - 1) + 1;
+static int __attribute__((noinline)) recurse(int depth)
+{
+ volatile int d = depth;
+ if (d <= 0)
+ return 0;
+ return recurse(d - 1) + 1;
 }
-int main(void) {
- volatile int input = 100;
- TRACEGRIND_ADD_MARKER("start");
- TRACEGRIND_START_INSTRUMENTATION;
- int result = recurse(input);
- TRACEGRIND_STOP_INSTRUMENTATION;
- TRACEGRIND_ADD_MARKER("end");
- return result != 100;
+int main(void)
+{
+ volatile int input = 100;
+ TRACEGRIND_ADD_MARKER("start");
+ TRACEGRIND_START_INSTRUMENTATION;
+ int result = recurse(input);
+ TRACEGRIND_STOP_INSTRUMENTATION;
+ TRACEGRIND_ADD_MARKER("end");
+ return result != 100;
 }
diff --git a/tracegrind/tests/test_signal.c b/tracegrind/tests/test_signal.c
index 3bfa48d0e..028354780 100644
--- a/tracegrind/tests/test_signal.c
+++ b/tracegrind/tests/test_signal.c
@@ -12,28 +12,31 @@ static volatile sig_atomic_t got_signal = 0;
-static void __attribute__((noinline)) handler_fn(int sig) {
- (void)sig;
- got_signal = 1;
+static void __attribute__((noinline)) handler_fn(int sig)
+{
+ (void)sig;
+ got_signal = 1;
 }
-static int __attribute__((noinline)) caller(int n) {
- volatile int x = n;
- raise(SIGALRM);
- return x + 1;
+static int __attribute__((noinline)) caller(int n)
+{
+ volatile int x = n;
+ raise(SIGALRM);
+ return x + 1;
 }
-int main(void) {
- struct sigaction sa;
- memset(&sa, 0, sizeof(sa));
- sa.sa_handler = handler_fn;
- sigaction(SIGALRM, &sa, NULL);
+int main(void)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = handler_fn;
+ sigaction(SIGALRM, &sa, NULL);
- volatile int input = 5;
- TRACEGRIND_ADD_MARKER("start");
- TRACEGRIND_START_INSTRUMENTATION;
- int result = caller(input);
- TRACEGRIND_STOP_INSTRUMENTATION;
- TRACEGRIND_ADD_MARKER("end");
- return (result != 6) || !got_signal;
+ volatile int input = 5;
+ TRACEGRIND_ADD_MARKER("start");
+ TRACEGRIND_START_INSTRUMENTATION;
+ int result = caller(input);
+ TRACEGRIND_STOP_INSTRUMENTATION;
+ TRACEGRIND_ADD_MARKER("end");
+ return (result != 6) || !got_signal;
 }
diff --git a/tracegrind/tests/test_syscall.c b/tracegrind/tests/test_syscall.c
index a3001f17b..9aac40a48 100644
--- a/tracegrind/tests/test_syscall.c
+++ b/tracegrind/tests/test_syscall.c
@@ -1,28 +1,29 @@
 #include "tracegrind.h"
-#include <unistd.h>
 #include <fcntl.h>
+#include <unistd.h>
-static int __attribute__((noinline)) do_getpid(void) {
- return getpid();
-}
+static int __attribute__((noinline)) do_getpid(void) { return getpid(); }
-static void __attribute__((noinline)) do_write(int fd) {
- const char msg[] = "hello\n";
- write(fd, msg, sizeof(msg) - 1);
+static void __attribute__((noinline)) do_write(int fd)
+{
+ const char msg[] = "hello\n";
+ write(fd, msg, sizeof(msg) - 1);
 }
-static void __attribute__((noinline)) caller(int fd) {
- do_getpid();
- do_write(fd);
+static void __attribute__((noinline)) caller(int fd)
+{
+ do_getpid();
+ do_write(fd);
 }
-int main(void) {
- int fd = open("/dev/null", O_WRONLY);
- TRACEGRIND_ADD_MARKER("start");
- TRACEGRIND_START_INSTRUMENTATION;
- caller(fd);
- TRACEGRIND_STOP_INSTRUMENTATION;
- TRACEGRIND_ADD_MARKER("end");
- close(fd);
- return 0;
+int main(void)
+{
+ int fd = open("/dev/null", O_WRONLY);
+ TRACEGRIND_ADD_MARKER("start");
+ TRACEGRIND_START_INSTRUMENTATION;
+ caller(fd);
+ TRACEGRIND_STOP_INSTRUMENTATION;
+ TRACEGRIND_ADD_MARKER("end");
+ close(fd);
+ return 0;
 }
diff --git a/tracegrind/tests/test_tailcall.c b/tracegrind/tests/test_tailcall.c
index 4a2868e70..b5524c69d 100644 --- a/tracegrind/tests/test_tailcall.c +++ b/tracegrind/tests/test_tailcall.c @@ -10,24 +10,19 @@ * Call chain: chain_a --(tail call)--> chain_b --(tail call)--> chain_c */ -static int __attribute__((noinline)) chain_c(int n) { - return n + 3; -} +static int __attribute__((noinline)) chain_c(int n) { return n + 3; } -static int __attribute__((noinline)) chain_b(int n) { - return chain_c(n + 2); -} +static int __attribute__((noinline)) chain_b(int n) { return chain_c(n + 2); } -static int __attribute__((noinline)) chain_a(int n) { - return chain_b(n + 1); -} +static int __attribute__((noinline)) chain_a(int n) { return chain_b(n + 1); } -int main(void) { - volatile int input = 10; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - int result = chain_a(input); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); - return result != 16; +int main(void) +{ + volatile int input = 10; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = chain_a(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 16; } diff --git a/tracegrind/tests/test_thread_create.c b/tracegrind/tests/test_thread_create.c index f04b0b167..29b340691 100644 --- a/tracegrind/tests/test_thread_create.c +++ b/tracegrind/tests/test_thread_create.c @@ -1,18 +1,20 @@ #include "tracegrind.h" #include -static void *thread_fn(void *arg) { - (void)arg; - return NULL; +static void* thread_fn(void* arg) +{ + (void)arg; + return NULL; } -int main(void) { - pthread_t t; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; - pthread_create(&t, NULL, thread_fn, NULL); - pthread_join(t, NULL); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); - return 0; +int main(void) +{ + pthread_t t; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + pthread_create(&t, NULL, thread_fn, NULL); + pthread_join(t, NULL); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return 0; } diff --git a/tracegrind/tests/test_thread_interleave.c b/tracegrind/tests/test_thread_interleave.c index 94ef46f05..93efcec47 100644 --- a/tracegrind/tests/test_thread_interleave.c +++ b/tracegrind/tests/test_thread_interleave.c @@ -1,54 +1,54 @@ #include "tracegrind.h" #include -__attribute__((noinline)) static void depth_a2(void) { } +__attribute__((noinline)) static void depth_a2(void) {} -__attribute__((noinline)) static void depth_a1(void) { - depth_a2(); -} +__attribute__((noinline)) static void depth_a1(void) { depth_a2(); } -__attribute__((noinline)) static void *work_a(void *arg) { - (void)arg; - depth_a1(); - return NULL; +__attribute__((noinline)) static void* work_a(void* arg) +{ + (void)arg; + depth_a1(); + return NULL; } -__attribute__((noinline)) static void depth_b1(void) { } +__attribute__((noinline)) static void depth_b1(void) {} -__attribute__((noinline)) static void *work_b(void *arg) { - (void)arg; - depth_b1(); - return NULL; +__attribute__((noinline)) static void* work_b(void* arg) +{ + (void)arg; + depth_b1(); + return NULL; } -__attribute__((noinline)) static void depth_c2(void) { } +__attribute__((noinline)) static void depth_c2(void) {} -__attribute__((noinline)) static void depth_c1(void) { - depth_c2(); -} +__attribute__((noinline)) static void depth_c1(void) { depth_c2(); } -__attribute__((noinline)) static void *work_c(void *arg) { - (void)arg; - depth_c1(); - return NULL; +__attribute__((noinline)) static void* work_c(void* arg) +{ + 
(void)arg; + depth_c1(); + return NULL; } -int main(void) { - pthread_t t1, t2, t3; +int main(void) +{ + pthread_t t1, t2, t3; - TRACEGRIND_ADD_MARKER("start"); - TRACEGRIND_START_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; - pthread_create(&t1, NULL, work_a, NULL); - pthread_create(&t2, NULL, work_b, NULL); - pthread_create(&t3, NULL, work_c, NULL); + pthread_create(&t1, NULL, work_a, NULL); + pthread_create(&t2, NULL, work_b, NULL); + pthread_create(&t3, NULL, work_c, NULL); - pthread_join(t1, NULL); - pthread_join(t2, NULL); - pthread_join(t3, NULL); + pthread_join(t1, NULL); + pthread_join(t2, NULL); + pthread_join(t3, NULL); - TRACEGRIND_STOP_INSTRUMENTATION; - TRACEGRIND_ADD_MARKER("end"); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); - return 0; + return 0; } diff --git a/tracegrind/tests/test_toggle_collect.c b/tracegrind/tests/test_toggle_collect.c index 4d7de4ceb..635caaacc 100644 --- a/tracegrind/tests/test_toggle_collect.c +++ b/tracegrind/tests/test_toggle_collect.c @@ -1,23 +1,25 @@ #include "tracegrind.h" -static int work(int n) { - int sum = 0; - for (int i = 0; i < n; i++) - sum += i; - return sum; +static int work(int n) +{ + int sum = 0; + for (int i = 0; i < n; i++) + sum += i; + return sum; } -int main(void) { - /* Collection on by default, do some traced work */ - int result = work(10); +int main(void) +{ + /* Collection on by default, do some traced work */ + int result = work(10); - /* Toggle collection off */ - TRACEGRIND_TOGGLE_COLLECT; - result += work(20); /* not collected */ + /* Toggle collection off */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(20); /* not collected */ - /* Toggle collection back on */ - TRACEGRIND_TOGGLE_COLLECT; - result += work(30); /* collected again */ + /* Toggle collection back on */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(30); /* collected again */ - return result == 0; + return result == 0; } diff --git a/tracegrind/tg_lz4.c b/tracegrind/tg_lz4.c index 8d4e66531..6a6dd3bcc 100644 --- a/tracegrind/tg_lz4.c +++ b/tracegrind/tg_lz4.c @@ -6,8 +6,8 @@ */ #include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" #include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" #include "pub_tool_mallocfree.h" #include "tg_lz4.h" @@ -49,19 +49,14 @@ /*--- Memory allocation functions (LZ4_USER_MEMORY_FUNCTIONS) */ /*------------------------------------------------------------*/ -void* LZ4_malloc(size_t s) -{ - return VG_(malloc)("tg.lz4", s); -} +void* LZ4_malloc(size_t s) { return VG_(malloc)("tg.lz4", s); } -void* LZ4_calloc(size_t n, size_t s) -{ - return VG_(calloc)("tg.lz4", n, s); -} +void* LZ4_calloc(size_t n, size_t s) { return VG_(calloc)("tg.lz4", n, s); } void LZ4_free(void* p) { - if (p) VG_(free)(p); + if (p) + VG_(free)(p); } /*------------------------------------------------------------*/ @@ -80,17 +75,18 @@ void LZ4_free(void* p) SizeT tg_lz4_compress_bound(SizeT src_size) { - return LZ4_compressBound((int)src_size); + return LZ4_compressBound((int)src_size); } -SizeT tg_lz4_compress(void* dst, SizeT dst_capacity, - const void* src, SizeT src_size) +SizeT tg_lz4_compress(void* dst, + SizeT dst_capacity, + const void* src, + SizeT src_size) { - int result = LZ4_compress_fast((const char*)src, (char*)dst, - (int)src_size, (int)dst_capacity, - 2 /* acceleration */); - if (result <= 0) { - return 0; - } - return (SizeT)result; + int result = LZ4_compress_fast((const char*)src, (char*)dst, (int)src_size, + (int)dst_capacity, 2 /* acceleration */); + if 
(result <= 0) { + return 0; + } + return (SizeT)result; } diff --git a/tracegrind/tg_lz4.h b/tracegrind/tg_lz4.h index 63a427501..7e127c0b2 100644 --- a/tracegrind/tg_lz4.h +++ b/tracegrind/tg_lz4.h @@ -15,7 +15,9 @@ SizeT tg_lz4_compress_bound(SizeT src_size); * dst_capacity must be >= tg_lz4_compress_bound(src_size). * Returns the compressed size on success, 0 on error. */ -SizeT tg_lz4_compress(void* dst, SizeT dst_capacity, - const void* src, SizeT src_size); +SizeT tg_lz4_compress(void* dst, + SizeT dst_capacity, + const void* src, + SizeT src_size); #endif /* TG_LZ4_H */ diff --git a/tracegrind/tg_msgpack.c b/tracegrind/tg_msgpack.c index da8911307..aa202f739 100644 --- a/tracegrind/tg_msgpack.c +++ b/tracegrind/tg_msgpack.c @@ -6,8 +6,8 @@ */ #include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" #include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" #include "pub_tool_mallocfree.h" #include "tg_msgpack.h" @@ -15,200 +15,196 @@ /* Ensure at least `needed` bytes of capacity */ static void msgpack_ensure(msgpack_buffer* mb, Int needed) { - if (mb->size + needed <= mb->capacity) - return; - Int new_cap = mb->capacity * 2; - if (new_cap < mb->size + needed) - new_cap = mb->size + needed; - mb->data = VG_(realloc)("tg.msgpack.buf", mb->data, new_cap); - mb->capacity = new_cap; + if (mb->size + needed <= mb->capacity) + return; + Int new_cap = mb->capacity * 2; + if (new_cap < mb->size + needed) + new_cap = mb->size + needed; + mb->data = VG_(realloc)("tg.msgpack.buf", mb->data, new_cap); + mb->capacity = new_cap; } static void write_byte(msgpack_buffer* mb, UChar b) { - msgpack_ensure(mb, 1); - mb->data[mb->size++] = b; + msgpack_ensure(mb, 1); + mb->data[mb->size++] = b; } static void write_bytes(msgpack_buffer* mb, const void* data, Int len) { - msgpack_ensure(mb, len); - VG_(memcpy)(mb->data + mb->size, data, len); - mb->size += len; + msgpack_ensure(mb, len); + VG_(memcpy)(mb->data + mb->size, data, len); + mb->size += len; } /* Write big-endian integers */ static void write_be16(msgpack_buffer* mb, UShort val) { - UChar buf[2]; - buf[0] = (UChar)(val >> 8); - buf[1] = (UChar)(val); - write_bytes(mb, buf, 2); + UChar buf[2]; + buf[0] = (UChar)(val >> 8); + buf[1] = (UChar)(val); + write_bytes(mb, buf, 2); } static void write_be32(msgpack_buffer* mb, UInt val) { - UChar buf[4]; - buf[0] = (UChar)(val >> 24); - buf[1] = (UChar)(val >> 16); - buf[2] = (UChar)(val >> 8); - buf[3] = (UChar)(val); - write_bytes(mb, buf, 4); + UChar buf[4]; + buf[0] = (UChar)(val >> 24); + buf[1] = (UChar)(val >> 16); + buf[2] = (UChar)(val >> 8); + buf[3] = (UChar)(val); + write_bytes(mb, buf, 4); } static void write_be64(msgpack_buffer* mb, ULong val) { - UChar buf[8]; - buf[0] = (UChar)(val >> 56); - buf[1] = (UChar)(val >> 48); - buf[2] = (UChar)(val >> 40); - buf[3] = (UChar)(val >> 32); - buf[4] = (UChar)(val >> 24); - buf[5] = (UChar)(val >> 16); - buf[6] = (UChar)(val >> 8); - buf[7] = (UChar)(val); - write_bytes(mb, buf, 8); + UChar buf[8]; + buf[0] = (UChar)(val >> 56); + buf[1] = (UChar)(val >> 48); + buf[2] = (UChar)(val >> 40); + buf[3] = (UChar)(val >> 32); + buf[4] = (UChar)(val >> 24); + buf[5] = (UChar)(val >> 16); + buf[6] = (UChar)(val >> 8); + buf[7] = (UChar)(val); + write_bytes(mb, buf, 8); } void msgpack_init(msgpack_buffer* mb, Int capacity) { - if (capacity < 256) capacity = 256; - mb->data = VG_(malloc)("tg.msgpack.init", capacity); - mb->size = 0; - mb->capacity = capacity; + if (capacity < 256) + capacity = 256; + mb->data = VG_(malloc)("tg.msgpack.init", 
capacity); + mb->size = 0; + mb->capacity = capacity; } void msgpack_free(msgpack_buffer* mb) { - if (mb->data) { - VG_(free)(mb->data); - mb->data = NULL; - } - mb->size = 0; - mb->capacity = 0; + if (mb->data) { + VG_(free)(mb->data); + mb->data = NULL; + } + mb->size = 0; + mb->capacity = 0; } -void msgpack_reset(msgpack_buffer* mb) -{ - mb->size = 0; -} +void msgpack_reset(msgpack_buffer* mb) { mb->size = 0; } -void msgpack_write_nil(msgpack_buffer* mb) -{ - write_byte(mb, 0xc0); -} +void msgpack_write_nil(msgpack_buffer* mb) { write_byte(mb, 0xc0); } void msgpack_write_bool(msgpack_buffer* mb, Bool val) { - write_byte(mb, val ? 0xc3 : 0xc2); + write_byte(mb, val ? 0xc3 : 0xc2); } void msgpack_write_int(msgpack_buffer* mb, Long val) { - if (val >= 0) { - msgpack_write_uint(mb, (ULong)val); - } else if (val >= -32) { - /* negative fixint: 111xxxxx */ - write_byte(mb, (UChar)(val & 0xff)); - } else if (val >= -128) { - write_byte(mb, 0xd0); /* int8 */ - write_byte(mb, (UChar)(val & 0xff)); - } else if (val >= -32768) { - write_byte(mb, 0xd1); /* int16 */ - write_be16(mb, (UShort)(val & 0xffff)); - } else if (val >= -2147483648LL) { - write_byte(mb, 0xd2); /* int32 */ - write_be32(mb, (UInt)(val & 0xffffffff)); - } else { - write_byte(mb, 0xd3); /* int64 */ - write_be64(mb, (ULong)val); - } + if (val >= 0) { + msgpack_write_uint(mb, (ULong)val); + } else if (val >= -32) { + /* negative fixint: 111xxxxx */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -128) { + write_byte(mb, 0xd0); /* int8 */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -32768) { + write_byte(mb, 0xd1); /* int16 */ + write_be16(mb, (UShort)(val & 0xffff)); + } else if (val >= -2147483648LL) { + write_byte(mb, 0xd2); /* int32 */ + write_be32(mb, (UInt)(val & 0xffffffff)); + } else { + write_byte(mb, 0xd3); /* int64 */ + write_be64(mb, (ULong)val); + } } void msgpack_write_uint(msgpack_buffer* mb, ULong val) { - if (val <= 0x7f) { - /* positive fixint: 0xxxxxxx */ - write_byte(mb, (UChar)val); - } else if (val <= 0xff) { - write_byte(mb, 0xcc); /* uint8 */ - write_byte(mb, (UChar)val); - } else if (val <= 0xffff) { - write_byte(mb, 0xcd); /* uint16 */ - write_be16(mb, (UShort)val); - } else if (val <= 0xffffffff) { - write_byte(mb, 0xce); /* uint32 */ - write_be32(mb, (UInt)val); - } else { - write_byte(mb, 0xcf); /* uint64 */ - write_be64(mb, val); - } + if (val <= 0x7f) { + /* positive fixint: 0xxxxxxx */ + write_byte(mb, (UChar)val); + } else if (val <= 0xff) { + write_byte(mb, 0xcc); /* uint8 */ + write_byte(mb, (UChar)val); + } else if (val <= 0xffff) { + write_byte(mb, 0xcd); /* uint16 */ + write_be16(mb, (UShort)val); + } else if (val <= 0xffffffff) { + write_byte(mb, 0xce); /* uint32 */ + write_be32(mb, (UInt)val); + } else { + write_byte(mb, 0xcf); /* uint64 */ + write_be64(mb, val); + } } void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len) { - if (len < 0) len = VG_(strlen)(str); - - if (len <= 31) { - /* fixstr: 101xxxxx */ - write_byte(mb, (UChar)(0xa0 | len)); - } else if (len <= 0xff) { - write_byte(mb, 0xd9); /* str8 */ - write_byte(mb, (UChar)len); - } else if (len <= 0xffff) { - write_byte(mb, 0xda); /* str16 */ - write_be16(mb, (UShort)len); - } else { - write_byte(mb, 0xdb); /* str32 */ - write_be32(mb, (UInt)len); - } - write_bytes(mb, str, len); + if (len < 0) + len = VG_(strlen)(str); + + if (len <= 31) { + /* fixstr: 101xxxxx */ + write_byte(mb, (UChar)(0xa0 | len)); + } else if (len <= 0xff) { + write_byte(mb, 0xd9); /* str8 */ + write_byte(mb, 
(UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xda); /* str16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xdb); /* str32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, str, len); } void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len) { - if (len <= 0xff) { - write_byte(mb, 0xc4); /* bin8 */ - write_byte(mb, (UChar)len); - } else if (len <= 0xffff) { - write_byte(mb, 0xc5); /* bin16 */ - write_be16(mb, (UShort)len); - } else { - write_byte(mb, 0xc6); /* bin32 */ - write_be32(mb, (UInt)len); - } - write_bytes(mb, data, len); + if (len <= 0xff) { + write_byte(mb, 0xc4); /* bin8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xc5); /* bin16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xc6); /* bin32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, data, len); } void msgpack_write_array_header(msgpack_buffer* mb, UInt count) { - if (count <= 15) { - /* fixarray: 1001xxxx */ - write_byte(mb, (UChar)(0x90 | count)); - } else if (count <= 0xffff) { - write_byte(mb, 0xdc); /* array16 */ - write_be16(mb, (UShort)count); - } else { - write_byte(mb, 0xdd); /* array32 */ - write_be32(mb, count); - } + if (count <= 15) { + /* fixarray: 1001xxxx */ + write_byte(mb, (UChar)(0x90 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xdc); /* array16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdd); /* array32 */ + write_be32(mb, count); + } } void msgpack_write_map_header(msgpack_buffer* mb, UInt count) { - if (count <= 15) { - /* fixmap: 1000xxxx */ - write_byte(mb, (UChar)(0x80 | count)); - } else if (count <= 0xffff) { - write_byte(mb, 0xde); /* map16 */ - write_be16(mb, (UShort)count); - } else { - write_byte(mb, 0xdf); /* map32 */ - write_be32(mb, count); - } + if (count <= 15) { + /* fixmap: 1000xxxx */ + write_byte(mb, (UChar)(0x80 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xde); /* map16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdf); /* map32 */ + write_be32(mb, count); + } } void msgpack_write_key(msgpack_buffer* mb, const HChar* key) { - msgpack_write_str(mb, key, -1); + msgpack_write_str(mb, key, -1); } diff --git a/tracegrind/tg_msgpack.h b/tracegrind/tg_msgpack.h index e04d317bc..ae447970b 100644 --- a/tracegrind/tg_msgpack.h +++ b/tracegrind/tg_msgpack.h @@ -9,9 +9,9 @@ #include "pub_tool_basics.h" typedef struct { - UChar* data; - Int size; - Int capacity; + UChar* data; + Int size; + Int capacity; } msgpack_buffer; void msgpack_init(msgpack_buffer* mb, Int capacity); diff --git a/tracegrind/threads.c b/tracegrind/threads.c index 734bf3f48..eaac68851 100644 --- a/tracegrind/threads.c +++ b/tracegrind/threads.c @@ -36,12 +36,10 @@ static exec_state* top_exec_state(void); static exec_stack current_states; - /*------------------------------------------------------------*/ /*--- Support for multi-threading ---*/ /*------------------------------------------------------------*/ - /* * For Valgrind, MT is cooperative (no preemting in our code), * so we don't need locks... 
@@ -61,150 +59,142 @@ ThreadId TG_(current_tid); static thread_info** thread; -thread_info** TG_(get_threads)(void) -{ - return thread; -} +thread_info** TG_(get_threads)(void) { return thread; } -thread_info* TG_(get_current_thread)(void) -{ - return thread[TG_(current_tid)]; -} +thread_info* TG_(get_current_thread)(void) { return thread[TG_(current_tid)]; } void TG_(init_threads)(void) { - UInt i; + UInt i; - thread = TG_MALLOC("cl.threads.it.1", VG_N_THREADS * sizeof thread[0]); + thread = TG_MALLOC("cl.threads.it.1", VG_N_THREADS * sizeof thread[0]); - for(i=0;istates) ); - TG_(init_call_stack)( &(t->calls) ); - TG_(init_fn_stack) ( &(t->fns) ); - /* t->states.entry[0]->cxt = TG_(get_cxt)(t->fns.bottom); */ - - /* event counters */ - t->lastdump_cost = TG_(get_eventset_cost)( TG_(sets).full ); - TG_(init_cost)( TG_(sets).full, t->lastdump_cost ); - - /* CSV trace: per-thread sample snapshot (allocated lazily in trace_emit_sample) */ - t->last_sample_cost = 0; - - /* init data containers */ - TG_(init_fn_array)( &(t->fn_active) ); - TG_(init_bbcc_hash)( &(t->bbccs) ); - TG_(init_jcc_hash)( &(t->jccs) ); - - return t; -} + thread_info* t; + t = (thread_info*)TG_MALLOC("cl.threads.nt.1", sizeof(thread_info)); -void TG_(switch_thread)(ThreadId tid) -{ - if (tid == TG_(current_tid)) return; - - TG_DEBUG(0, ">> thread %u (was %u)\n", tid, TG_(current_tid)); - - if (TG_(current_tid) != VG_INVALID_THREADID) { - /* save thread state */ - thread_info* t = thread[TG_(current_tid)]; - - TG_ASSERT(t != 0); - - /* current context (including signal handler contexts) */ - exec_state_save(); - TG_(copy_current_exec_stack)( &(t->states) ); - TG_(copy_current_call_stack)( &(t->calls) ); - TG_(copy_current_fn_stack) ( &(t->fns) ); - - TG_(copy_current_fn_array) ( &(t->fn_active) ); - /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ - if (!TG_(clo).separate_threads) t = thread[1]; - TG_(copy_current_bbcc_hash)( &(t->bbccs) ); - TG_(copy_current_jcc_hash) ( &(t->jccs) ); - } - - TG_(current_tid) = tid; - TG_ASSERT(tid < VG_N_THREADS); - - if (tid != VG_INVALID_THREADID) { - thread_info* t; - - /* load thread state */ - - if (thread[tid] == 0) thread[tid] = new_thread(); - t = thread[tid]; - - /* current context (including signal handler contexts) */ - TG_(set_current_exec_stack)( &(t->states) ); - exec_state_restore(); - TG_(set_current_call_stack)( &(t->calls) ); - TG_(set_current_fn_stack) ( &(t->fns) ); - - TG_(set_current_fn_array) ( &(t->fn_active) ); - /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ - if (!TG_(clo).separate_threads) t = thread[1]; - TG_(set_current_bbcc_hash) ( &(t->bbccs) ); - TG_(set_current_jcc_hash) ( &(t->jccs) ); - } -} + /* init state */ + TG_(init_exec_stack)(&(t->states)); + TG_(init_call_stack)(&(t->calls)); + TG_(init_fn_stack)(&(t->fns)); + /* t->states.entry[0]->cxt = TG_(get_cxt)(t->fns.bottom); */ + + /* event counters */ + t->lastdump_cost = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, t->lastdump_cost); + + /* CSV trace: per-thread sample snapshot (allocated lazily in + * trace_emit_sample) */ + t->last_sample_cost = 0; + /* init data containers */ + TG_(init_fn_array)(&(t->fn_active)); + TG_(init_bbcc_hash)(&(t->bbccs)); + TG_(init_jcc_hash)(&(t->jccs)); -void TG_(run_thread)(ThreadId tid) + return t; +} + +void TG_(switch_thread)(ThreadId tid) { - TG_(switch_thread)(tid); + if (tid == TG_(current_tid)) + return; + + TG_DEBUG(0, ">> thread %u (was %u)\n", tid, TG_(current_tid)); + + if (TG_(current_tid) 
!= VG_INVALID_THREADID) { + /* save thread state */ + thread_info* t = thread[TG_(current_tid)]; + + TG_ASSERT(t != 0); + + /* current context (including signal handler contexts) */ + exec_state_save(); + TG_(copy_current_exec_stack)(&(t->states)); + TG_(copy_current_call_stack)(&(t->calls)); + TG_(copy_current_fn_stack)(&(t->fns)); + + TG_(copy_current_fn_array)(&(t->fn_active)); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) + t = thread[1]; + TG_(copy_current_bbcc_hash)(&(t->bbccs)); + TG_(copy_current_jcc_hash)(&(t->jccs)); + } + + TG_(current_tid) = tid; + TG_ASSERT(tid < VG_N_THREADS); + + if (tid != VG_INVALID_THREADID) { + thread_info* t; + + /* load thread state */ + + if (thread[tid] == 0) + thread[tid] = new_thread(); + t = thread[tid]; + + /* current context (including signal handler contexts) */ + TG_(set_current_exec_stack)(&(t->states)); + exec_state_restore(); + TG_(set_current_call_stack)(&(t->calls)); + TG_(set_current_fn_stack)(&(t->fns)); + + TG_(set_current_fn_array)(&(t->fn_active)); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) + t = thread[1]; + TG_(set_current_bbcc_hash)(&(t->bbccs)); + TG_(set_current_jcc_hash)(&(t->jccs)); + } } +void TG_(run_thread)(ThreadId tid) { TG_(switch_thread)(tid); } + void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack) { - exec_state *es; + exec_state* es; - TG_DEBUG(0, ">> pre_signal(TID %u, sig %d, alt_st %s)\n", - tid, sigNum, alt_stack ? "yes":"no"); + TG_DEBUG(0, ">> pre_signal(TID %u, sig %d, alt_st %s)\n", tid, sigNum, + alt_stack ? "yes" : "no"); - /* switch to the thread the handler runs in */ - TG_(switch_thread)(tid); + /* switch to the thread the handler runs in */ + TG_(switch_thread)(tid); - /* save current execution state */ - exec_state_save(); + /* save current execution state */ + exec_state_save(); - /* setup new cxtinfo struct for this signal handler */ - es = push_exec_state(sigNum); - TG_(zero_cost)( TG_(sets).full, es->cost ); - TG_(current_state).cost = es->cost; - es->call_stack_bottom = TG_(current_call_stack).sp; + /* setup new cxtinfo struct for this signal handler */ + es = push_exec_state(sigNum); + TG_(zero_cost)(TG_(sets).full, es->cost); + TG_(current_state).cost = es->cost; + es->call_stack_bottom = TG_(current_call_stack).sp; - /* setup current state for a spontaneous call */ - TG_(init_exec_state)( &TG_(current_state) ); - TG_(current_state).sig = sigNum; - TG_(push_cxt)(0); + /* setup current state for a spontaneous call */ + TG_(init_exec_state)(&TG_(current_state)); + TG_(current_state).sig = sigNum; + TG_(push_cxt)(0); } /* Run post-signal if the stackpointer for call stack is at @@ -214,75 +204,72 @@ void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack) */ void TG_(run_post_signal_on_call_stack_bottom)(void) { - exec_state* es = top_exec_state(); - TG_ASSERT(es != 0); - TG_ASSERT(TG_(current_state).sig >0); + exec_state* es = top_exec_state(); + TG_ASSERT(es != 0); + TG_ASSERT(TG_(current_state).sig > 0); - if (TG_(current_call_stack).sp == es->call_stack_bottom) - TG_(post_signal)( TG_(current_tid), TG_(current_state).sig ); + if (TG_(current_call_stack).sp == es->call_stack_bottom) + TG_(post_signal)(TG_(current_tid), TG_(current_state).sig); } void TG_(post_signal)(ThreadId tid, Int sigNum) { - exec_state* es; - UInt fn_number, *pactive; - - TG_DEBUG(0, ">> post_signal(TID %u, sig %d)\n", - tid, sigNum); - - /* thread switching potentially needed, eg. 
with instrumentation off */ - TG_(switch_thread)(tid); - TG_ASSERT(sigNum == TG_(current_state).sig); - - /* Unwind call stack of this signal handler. - * This should only be needed at finalisation time - */ - es = top_exec_state(); - TG_ASSERT(es != 0); - while(TG_(current_call_stack).sp > es->call_stack_bottom) + exec_state* es; + UInt fn_number, *pactive; + + TG_DEBUG(0, ">> post_signal(TID %u, sig %d)\n", tid, sigNum); + + /* thread switching potentially needed, eg. with instrumentation off */ + TG_(switch_thread)(tid); + TG_ASSERT(sigNum == TG_(current_state).sig); + + /* Unwind call stack of this signal handler. + * This should only be needed at finalisation time + */ + es = top_exec_state(); + TG_ASSERT(es != 0); + while (TG_(current_call_stack).sp > es->call_stack_bottom) TG_(pop_call_stack)(); - - if (TG_(current_state).cxt) { + + if (TG_(current_state).cxt) { /* correct active counts */ fn_number = TG_(current_state).cxt->fn[0]->number; - pactive = TG_(get_fn_entry)(fn_number); + pactive = TG_(get_fn_entry)(fn_number); (*pactive)--; TG_DEBUG(0, " set active count of %s back to %u\n", - TG_(current_state).cxt->fn[0]->name, *pactive); - } - - if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) { - /* set fn_stack_top back. - * top can point to 0 if nothing was executed in the signal handler; - * this is possible at end on unwinding handlers. - */ - if (*(TG_(current_fn_stack).top) != 0) { - TG_(current_fn_stack).top--; - TG_ASSERT(*(TG_(current_fn_stack).top) == 0); - } + TG_(current_state).cxt->fn[0]->name, *pactive); + } + + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) { + /* set fn_stack_top back. + * top can point to 0 if nothing was executed in the signal handler; + * this is possible at end on unwinding handlers. + */ + if (*(TG_(current_fn_stack).top) != 0) { + TG_(current_fn_stack).top--; + TG_ASSERT(*(TG_(current_fn_stack).top) == 0); + } if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) - TG_(current_fn_stack).top--; - } - - /* zero signal handler costs before restoring previous context */ - TG_ASSERT(TG_(current_state).cost == es->cost); - TG_(zero_cost)( TG_(sets).full, TG_(current_state).cost ); - - /* restore previous context */ - es->sig = -1; - current_states.sp--; - es = top_exec_state(); - TG_(current_state).sig = es->sig; - exec_state_restore(); - - /* There is no way to reliable get the thread ID we are switching to - * after this handler returns. So we sync with actual TID at start of - * TG_(setup_bb)(), which should be the next for tracegrind. - */ + TG_(current_fn_stack).top--; + } + + /* zero signal handler costs before restoring previous context */ + TG_ASSERT(TG_(current_state).cost == es->cost); + TG_(zero_cost)(TG_(sets).full, TG_(current_state).cost); + + /* restore previous context */ + es->sig = -1; + current_states.sp--; + es = top_exec_state(); + TG_(current_state).sig = es->sig; + exec_state_restore(); + + /* There is no way to reliable get the thread ID we are switching to + * after this handler returns. So we sync with actual TID at start of + * TG_(setup_bb)(), which should be the next for tracegrind. 
+ */ } - - /*------------------------------------------------------------*/ /*--- Execution states in a thread & signal handlers ---*/ /*------------------------------------------------------------*/ @@ -298,72 +285,68 @@ void TG_(post_signal)(ThreadId tid, Int sigNum) /* not initialized: call_stack_bottom, sig */ void TG_(init_exec_state)(exec_state* es) { - es->collect = TG_(clo).collect_atstart; - es->cxt = 0; - es->jmps_passed = 0; - es->bbcc = 0; - es->nonskipped = 0; + es->collect = TG_(clo).collect_atstart; + es->cxt = 0; + es->jmps_passed = 0; + es->bbcc = 0; + es->nonskipped = 0; } - static exec_state* new_exec_state(Int sigNum) { - exec_state* es; - es = (exec_state*) TG_MALLOC("cl.threads.nes.1", - sizeof(exec_state)); - - /* allocate real cost space: needed as incremented by - * simulation functions */ - es->cost = TG_(get_eventset_cost)(TG_(sets).full); - TG_(zero_cost)( TG_(sets).full, es->cost ); - TG_(init_exec_state)(es); - es->sig = sigNum; - es->call_stack_bottom = 0; - - return es; + exec_state* es; + es = (exec_state*)TG_MALLOC("cl.threads.nes.1", sizeof(exec_state)); + + /* allocate real cost space: needed as incremented by + * simulation functions */ + es->cost = TG_(get_eventset_cost)(TG_(sets).full); + TG_(zero_cost)(TG_(sets).full, es->cost); + TG_(init_exec_state)(es); + es->sig = sigNum; + es->call_stack_bottom = 0; + + return es; } void TG_(init_exec_stack)(exec_stack* es) { - Int i; + Int i; - /* The first element is for the main thread */ - es->entry[0] = new_exec_state(0); - for(i=1;ientry[i] = 0; - es->sp = 0; + /* The first element is for the main thread */ + es->entry[0] = new_exec_state(0); + for (i = 1; i < MAX_SIGHANDLERS; i++) + es->entry[i] = 0; + es->sp = 0; } void TG_(copy_current_exec_stack)(exec_stack* dst) { - Int i; + Int i; - dst->sp = current_states.sp; - for(i=0;ientry[i] = current_states.entry[i]; + dst->sp = current_states.sp; + for (i = 0; i < MAX_SIGHANDLERS; i++) + dst->entry[i] = current_states.entry[i]; } void TG_(set_current_exec_stack)(exec_stack* dst) { - Int i; + Int i; - current_states.sp = dst->sp; - for(i=0;ientry[i]; + current_states.sp = dst->sp; + for (i = 0; i < MAX_SIGHANDLERS; i++) + current_states.entry[i] = dst->entry[i]; } - /* Get top context info struct of current thread */ -static -exec_state* top_exec_state(void) +static exec_state* top_exec_state(void) { - Int sp = current_states.sp; - exec_state* es; + Int sp = current_states.sp; + exec_state* es; - TG_ASSERT((sp >= 0) && (sp < MAX_SIGHANDLERS)); - es = current_states.entry[sp]; - TG_ASSERT(es != 0); - return es; + TG_ASSERT((sp >= 0) && (sp < MAX_SIGHANDLERS)); + es = current_states.entry[sp]; + TG_ASSERT(es != 0); + return es; } /* Allocates a free context info structure for a new entered @@ -371,72 +354,71 @@ exec_state* top_exec_state(void) * Returns a pointer to the structure. 
*/ static exec_state* push_exec_state(int sigNum) -{ - Int sp; - exec_state* es; - - current_states.sp++; - sp = current_states.sp; - - TG_ASSERT((sigNum > 0) && (sigNum <= _VKI_NSIG)); - TG_ASSERT((sp > 0) && (sp < MAX_SIGHANDLERS)); - es = current_states.entry[sp]; - if (!es) { - es = new_exec_state(sigNum); - current_states.entry[sp] = es; - } - else - es->sig = sigNum; - - return es; +{ + Int sp; + exec_state* es; + + current_states.sp++; + sp = current_states.sp; + + TG_ASSERT((sigNum > 0) && (sigNum <= _VKI_NSIG)); + TG_ASSERT((sp > 0) && (sp < MAX_SIGHANDLERS)); + es = current_states.entry[sp]; + if (!es) { + es = new_exec_state(sigNum); + current_states.entry[sp] = es; + } else + es->sig = sigNum; + + return es; } /* Save current context to top cxtinfo struct */ -static -exec_state* exec_state_save(void) +static exec_state* exec_state_save(void) { - exec_state* es = top_exec_state(); - - es->cxt = TG_(current_state).cxt; - es->collect = TG_(current_state).collect; - es->jmps_passed = TG_(current_state).jmps_passed; - es->bbcc = TG_(current_state).bbcc; - es->nonskipped = TG_(current_state).nonskipped; - TG_ASSERT(es->cost == TG_(current_state).cost); - - TG_DEBUGIF(1) { - TG_DEBUG(1, " cxtinfo_save(sig %d): collect %s, jmps_passed %d\n", - es->sig, es->collect ? "Yes": "No", es->jmps_passed); - TG_(print_bbcc)(-9, es->bbcc); - TG_(print_cost)(-9, TG_(sets).full, es->cost); - } - - /* signal number does not need to be saved */ - TG_ASSERT(TG_(current_state).sig == es->sig); - - return es; + exec_state* es = top_exec_state(); + + es->cxt = TG_(current_state).cxt; + es->collect = TG_(current_state).collect; + es->jmps_passed = TG_(current_state).jmps_passed; + es->bbcc = TG_(current_state).bbcc; + es->nonskipped = TG_(current_state).nonskipped; + TG_ASSERT(es->cost == TG_(current_state).cost); + + TG_DEBUGIF(1) + { + TG_DEBUG(1, " cxtinfo_save(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? "Yes" : "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + /* signal number does not need to be saved */ + TG_ASSERT(TG_(current_state).sig == es->sig); + + return es; } -static -exec_state* exec_state_restore(void) +static exec_state* exec_state_restore(void) { - exec_state* es = top_exec_state(); - - TG_(current_state).cxt = es->cxt; - TG_(current_state).collect = es->collect; - TG_(current_state).jmps_passed = es->jmps_passed; - TG_(current_state).bbcc = es->bbcc; - TG_(current_state).nonskipped = es->nonskipped; - TG_(current_state).cost = es->cost; - TG_(current_state).sig = es->sig; - - TG_DEBUGIF(1) { - TG_DEBUG(1, " exec_state_restore(sig %d): collect %s, jmps_passed %d\n", - es->sig, es->collect ? "Yes": "No", es->jmps_passed); - TG_(print_bbcc)(-9, es->bbcc); - TG_(print_cxt)(-9, es->cxt, 0); - TG_(print_cost)(-9, TG_(sets).full, es->cost); - } - - return es; + exec_state* es = top_exec_state(); + + TG_(current_state).cxt = es->cxt; + TG_(current_state).collect = es->collect; + TG_(current_state).jmps_passed = es->jmps_passed; + TG_(current_state).bbcc = es->bbcc; + TG_(current_state).nonskipped = es->nonskipped; + TG_(current_state).cost = es->cost; + TG_(current_state).sig = es->sig; + + TG_DEBUGIF(1) + { + TG_DEBUG(1, " exec_state_restore(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? 
"Yes" : "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cxt)(-9, es->cxt, 0); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + return es; } diff --git a/tracegrind/tracegrind.h b/tracegrind/tracegrind.h index 0b37d11c8..f600cf2b7 100644 --- a/tracegrind/tracegrind.h +++ b/tracegrind/tracegrind.h @@ -73,15 +73,14 @@ callgrind client request macros. */ -typedef - enum { - VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'), // ignored - VG_USERREQ__ZERO_STATS, // ignored - VG_USERREQ__TOGGLE_COLLECT, - VG_USERREQ__ADD_MARKER, - VG_USERREQ__START_INSTRUMENTATION, - VG_USERREQ__STOP_INSTRUMENTATION - } Vg_TracegrindClientRequest; +typedef enum { + VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C', 'T'), // ignored + VG_USERREQ__ZERO_STATS, // ignored + VG_USERREQ__TOGGLE_COLLECT, + VG_USERREQ__ADD_MARKER, + VG_USERREQ__START_INSTRUMENTATION, + VG_USERREQ__STOP_INSTRUMENTATION +} Vg_TracegrindClientRequest; /* Toggles collection state. The collection state specifies whether the happening of events @@ -90,18 +89,17 @@ typedef Same as CALLGRIND_TOGGLE_COLLECT */ -#define TRACEGRIND_TOGGLE_COLLECT \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, \ - 0, 0, 0, 0, 0) +#define TRACEGRIND_TOGGLE_COLLECT \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, 0, 0, 0, 0, 0) /* Add a named marker into the trace output. The argument is a string that will be recorded as a marker label. Same as CALLGRIND_DUMP_STATS_AT */ -#define TRACEGRIND_ADD_MARKER(marker_str) \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ADD_MARKER, \ - marker_str, 0, 0, 0, 0) +#define TRACEGRIND_ADD_MARKER(marker_str) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ADD_MARKER, marker_str, 0, 0, \ + 0, 0) /* Start full tracegrind instrumentation if not already switched on. When cache simulation is done, it will flush the simulated cache; @@ -110,9 +108,9 @@ typedef Same as CALLGRIND_START_INSTRUMENTATION */ -#define TRACEGRIND_START_INSTRUMENTATION \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, \ - 0, 0, 0, 0, 0) +#define TRACEGRIND_START_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, 0, 0, 0, \ + 0, 0) /* Stop full tracegrind instrumentation if not already switched off. This flushes Valgrinds translation cache, and does no additional @@ -124,8 +122,8 @@ typedef Same as CALLGRIND_STOP_INSTRUMENTATION */ -#define TRACEGRIND_STOP_INSTRUMENTATION \ - VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, \ - 0, 0, 0, 0, 0) +#define TRACEGRIND_STOP_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, 0, 0, 0, \ + 0, 0) #endif /* __TRACEGRIND_H */