hakmem/scripts/make_chatgpt_pro_packet_free_path.sh

#!/usr/bin/env bash
set -euo pipefail

# Generate a compact "free-path review packet" for sharing with ChatGPT Pro.
# Output: Markdown to stdout (copy/paste).
#
# Usage:
#   scripts/make_chatgpt_pro_packet_free_path.sh > /tmp/free_path_packet.md
#
# Notes:
# - Extracts key functions with a simple brace counter.
# - Clips each snippet to keep it shareable.

# Resolve the repository root as the parent of the directory that holds this
# script, then work from there so the relative core/... paths used further down
# resolve no matter where the script is invoked from.
# - CDPATH is emptied for the lookup: with CDPATH set, `cd` to a relative path
#   may pick a directory from CDPATH and print its name, which would end up
#   inside root_dir together with the output of pwd.
# - `--` keeps a path that starts with '-' from being parsed as an option.
root_dir="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
cd -- "${root_dir}"

# Default clip is intentionally small; you can override via CLIP_LINES=...
clip="${CLIP_LINES:-160}"

# Abort the script (exit 1) with a "[packet] missing <name>" line on stderr
# unless `command -v` can resolve the given command name.
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "[packet] missing $1" >&2
    exit 1
  fi
}

# The extraction pipeline below shells out to both of these.
need awk
need sed

# Print one brace-delimited region of a source file, clipped to a line limit.
#
# Arguments:
#   $1 - file to read
#   $2 - awk regex marking the first line of the region. It is handed to awk
#        with -v, which processes backslash escapes once, so callers write
#        \\( to match a literal "(".
#   $3 - which match to use (1 = first line matching the regex)
#   $4 - maximum number of lines to print
# Outputs:
#   stdout: the region, from the chosen matching line through the line on which
#           the running {/} balance drops back to zero (after at least one "{"
#           has been seen), truncated to $4 lines.
#   stderr: a "[packet] warning" line when the requested match does not exist,
#           so an empty snippet in the packet does not go unnoticed. The exit
#           status is not affected by this.
# Notes:
#   The brace counter is purely textual: braces inside string literals or
#   comments are counted as well.
extract_func_n_clip() {
  local file="$1"
  local re="$2"
  local nth="$3"
  local clip_lines="$4"

  # sed (rather than an early-exiting reader) does the clipping: it consumes
  # all of awk output, so awk is never cut off mid-write.
  awk -v re="${re}" -v nth="${nth}" -v src="${file}" '
    # Number of occurrences of the single character c in string s.
    function count_char(s, c,   i, n) {
      n = 0
      for (i = 1; i <= length(s); i++) {
        if (substr(s, i, 1) == c) n++
      }
      return n
    }
    BEGIN { hit = 0; started = 0; depth = 0; seen_open = 0 }
    {
      # Matches are only counted until the requested one has been found.
      if (!started && ($0 ~ re) && (++hit == nth)) started = 1

      if (started) {
        print $0
        opens = count_char($0, "{")
        if (opens > 0) seen_open = 1
        depth += opens - count_char($0, "}")
        # Balanced again after the first opening brace: region is complete.
        if (seen_open && depth <= 0) exit 0
      }
    }
    END {
      if (!started) {
        printf("[packet] warning: match #%s of /%s/ not found in %s\n", nth, re, src) > "/dev/stderr"
      }
    }
  ' "${file}" | sed -n "1,${clip_lines}p"
}

# Shorthand for the common case: first match of regex $2 in file $1, clipped
# to the script-wide ${clip} line limit.
extract_func() {
  local src_file="$1"
  local pattern="$2"
  extract_func_n_clip "${src_file}" "${pattern}" 1 "${clip}"
}

# Wrap stdin in a Markdown fenced code block.
#   $1 - language tag placed after the opening fence
#   $2 - label shown in the "###" heading above the block
# Emits: a blank line, the heading, the opening fence, stdin verbatim, and
# the closing fence.
md_code() {
  local fence_lang="$1"
  local label="$2"
  printf '\n### `%s`\n```%s\n' "${label}" "${fence_lang}"
  cat
  printf '```\n'
}

# Packet preamble: title, goal, bench conditions and the request to the
# reviewer. The quoted delimiter ('MD') makes the here-doc literal, so nothing
# inside it is expanded by the shell.
cat <<'MD'
# Hakmem free-path review packet (compact)

Goal: understand remaining fixed costs vs mimalloc/tcmalloc, with Box Theory (single boundary, reversible ENV gates).

SSOT bench conditions (current practice):
- `HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE`
- `ITERS=20000000 WS=400 RUNS=10`
- run via `scripts/run_mixed_10_cleanenv.sh`

Request:
1) Where is the dominant fixed cost on free path now?
2) What structural change would give +5–10% without breaking Box Theory?
3) What NOT to do (layout tax pitfalls)?
MD

# Blank separator line, then the heading under which the snippets are printed.
echo ""
echo "## Code excerpts (clipped)"

# Scope: only the hot tiny-free pipeline (the most actionable for
# instruction/branch work). Wrapper/registry code can go into a larger packet
# if the reviewer asks for it.

# Print one fenced C excerpt labelled with its file name:
#   $1 - source file, $2 - start-line regex, $3 - line limit (default ${clip}).
emit_excerpt() {
  extract_func_n_clip "$1" "$2" 1 "${3:-${clip}}" | md_code c "$1"
}

# A) tiny_free_gate_try_fast(): user_ptr -> class_idx/base -> tiny_hot_free_fast()/fallback
emit_excerpt core/box/tiny_free_gate_box.h \
  '^static inline int tiny_free_gate_try_fast\\(void\\* user_ptr\\)'

# B) free_tiny_fast(): main Tiny free dispatcher (hot/cold + env snapshot)
emit_excerpt core/front/malloc_tiny_fast.h \
  '^static inline int free_tiny_fast\\(void\\* ptr\\)' \
  220

# C) tiny_hot_free_fast(): TLS unified cache push
emit_excerpt core/box/tiny_front_hot_box.h \
  '^static inline int tiny_hot_free_fast\\(int class_idx, void\\* base\\)'

# D) tiny_legacy_fallback_free_base_with_env(): inline-slots cascade + unified_cache_push(_fast)
emit_excerpt core/box/tiny_legacy_fallback_box.h \
  '^static inline void tiny_legacy_fallback_free_base_with_env\\(void\\* base, uint32_t class_idx, const HakmemEnvSnapshot\\* env\\)' \
  260

# Closing section of the packet: the concrete questions for the reviewer.
# Literal here-doc (quoted 'MD' delimiter), so the backticks and parentheses
# in the text are printed as-is.
cat <<'MD'

## Questions to answer (please be concrete)

1) In these snippets, which checks/branches are still "per-op fixed taxes" on the hot free path?
   - Please point to specific lines/conditions and estimate cost (branches/instructions or dependency chain).

2) Is `tiny_hot_free_fast()` already close to optimal, and the real bottleneck is upstream (user->base/classify/route)?
   - If yes, what’s the smallest structural refactor that removes that upstream fixed tax?

3) Should we introduce a "commit once" plan (freeze the chosen free path) — or is branch prediction already making lazy-init checks ~free here?
   - If "commit once", where should it live to avoid runtime gate overhead (bench_profile refresh boundary vs per-op)?

4) We have had many layout-tax regressions from code removal/reordering.
   - What patterns here are most likely to trigger layout tax if changed?
   - How would you stage a safe A/B (same binary, ENV toggle) for your proposal?

5) If you could change just ONE of:
   - pointer classification to base/class_idx,
   - route determination,
   - unified cache push/pop structure,
   which is highest ROI for +5–10% on WS=400?

MD

# Completion marker. stdout is the Markdown packet itself (the documented usage
# redirects it to a file), so this status line goes to stderr, the same stream
# the "[packet] missing ..." diagnostic uses, and never lands in the packet.
echo "[packet] done" >&2