From 505109ca4f885aafe979c5c3a7efb72e105d791b Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 21 May 2026 17:38:11 +0200
Subject: [PATCH 01/31] Speed up ribbon generation

---
 recon_surf/recon-surf.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index e0e480493..ff0b06fd6 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1089,8 +1089,13 @@ then
   # anatomical stats can run without ribbon, but will omit some surface based measures then
   # wmparc needs ribbon, probably other stuff (aparc to aseg etc).
   # So lets run it to have these measures below.
-  cmd="recon-all -subject $subject -cortribbon -umask $(umask) $hiresflag $fsthreads"
+  cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
+    --label_right_white 41 --label_right_ribbon 42 --save_ribbon"
+  if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
+  cmd="$cmd $subject"
   RunIt "$cmd" "$LF"
+  mkdir -p "$SUBJECTS_DIR/$subject/touch"
+  echo "$cmd" > "$SUBJECTS_DIR/$subject/touch/cortical_ribbon.touch"
 
 fi # skip in base
 

From d3741d1a2767f2aa82d959b50b55199c193c0f05 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 21 May 2026 17:38:27 +0200
Subject: [PATCH 02/31] Speed up spherical morphing

---
 recon_surf/recon-surf.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index ff0b06fd6..0e2aca5aa 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -872,9 +872,14 @@ for hemi in lh rh ; do
     if [[ "$long" == "false" ]]
     then
 
-      # SPHERE: Inflate to sphere with minimal metric distortion
-      cmd="recon-all -subject $subject -hemi $hemi -sphere $hiresflag -no-isrunning -umask $(umask) $fsthreads"
+      # SPHERE: Inflate to sphere with minimal metric distortion.
+      # This step is deterministic with the fixed seed and scales better when it can use the full requested
+      # thread count.  Run it directly to avoid constraining it to per-hemisphere threads.
+      cmd="env OMP_NUM_THREADS=$threads ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS=1 \
+        mris_sphere -seed 1234 $sdir/${hemi}.inflated $sdir/${hemi}.sphere"
       RunIt "$cmd" "$LF" "$CMDF"
+      echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+      echo "echo \"mris_sphere -seed 1234 ../surf/${hemi}.inflated ../surf/${hemi}.sphere\" > $SUBJECTS_DIR/$subject/touch/${hemi}.sphmorph.touch" >> "$CMDF"
 
       # SURFREG (sphere.reg)
       # Surface registration for cross-subject correspondence (registration to fsaverage)

From 7b2985cea845c3ba76763a79002938b386a71401 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 21 May 2026 17:38:44 +0200
Subject: [PATCH 03/31] Overlap surface statistics

---
 recon_surf/recon-surf.sh | 101 +++++++++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 20 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 0e2aca5aa..695c440a0 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -394,6 +394,56 @@ if [[ "$DoneFile" != /dev/null ]] ; then rm -f "$DoneFile" ; fi
 LF="$SUBJECTS_DIR/$subject/scripts/recon-surf.log"
 if [[ "$LF" != /dev/null ]]  && [[ "$edits" != "true" ]]; then rm -f "$LF" ; fi
 echo "Log file for recon-surf.sh" >> "$LF"
+
+ASYNC_PIDS=()
+ASYNC_LOGS=()
+ASYNC_CMDFS=()
+
+function start_async_cmdf()
+{
+  local cmdf=$1
+  local log="$cmdf.log"
+  chmod u+x "$cmdf"
+  printf "\n %s\n\n" "$cmdf" > "$log"
+  "$cmdf" >> "$log" 2>&1 &
+  ASYNC_PIDS+=("$!")
+  ASYNC_LOGS+=("$log")
+  ASYNC_CMDFS+=("$cmdf")
+}
+
+function wait_async_cmdfs()
+{
+  local unsuccessful=()
+  local status
+  local i
+  for i in "${!ASYNC_PIDS[@]}"
+  do
+    echo "Waiting for async PID ${ASYNC_PIDS[i]} of (${ASYNC_PIDS[*]}) to complete..." | tee -a "$LF"
+    wait "${ASYNC_PIDS[i]}"
+    status="$?"
+    tee -a "$LF" < "${ASYNC_LOGS[i]}"
+    rm -f "${ASYNC_LOGS[i]}"
+    if [[ "$status" != "0" ]]
+    then
+      unsuccessful+=("$i")
+      {
+        echo "ERROR: The async script ${ASYNC_CMDFS[i]} (PID: ${ASYNC_PIDS[i]}) did not complete successfully!"
+        echo "========================================"
+        echo ""
+      } | tee -a "$LF"
+    fi
+  done
+
+  if [[ "${#unsuccessful}" != 0 ]]
+  then
+    echo "Async PIDs (${unsuccessful[*]}) of (${ASYNC_PIDS[*]}) have NOT completed successfully! All logs appended." | tee -a "$LF"
+    exit 1
+  elif [[ "${#ASYNC_PIDS[@]}" != 0 ]]
+  then
+    echo "Async PIDs (${ASYNC_PIDS[*]}) completed successfully! Their logs have been appended." | tee -a "$LF"
+  fi
+}
+
 { # all output tee -a "$LF"
   date 2>&1
   echo " "
@@ -1160,18 +1210,41 @@ then
 
 # ============================= MAPPED SURF-STATS =========================================
 
+  MAPPED_STATS_CMDF="$SUBJECTS_DIR/$subject/scripts/mapped_stats.cmdf"
+  rm -f "$MAPPED_STATS_CMDF"
   {
-    echo ""
-    echo "===================== Creating surfaces - mapped stats ========================="
-    echo ""
-  } | tee -a "$LF"
+    echo "#!/bin/bash"
+    echo "echo \"\""
+    echo "echo \"===================== Creating surfaces - mapped stats =========================\""
+    echo "echo \"\""
+  } > "$MAPPED_STATS_CMDF"
 
-  # 2x18sec create stats from mapped aparc
+  # 2x18sec create stats from mapped aparc.  This only depends on completed surfaces and
+  # ribbon outputs, so it can overlap with the later volume-labeling/statistics chain.
   for hemi in lh rh
   do
     cmd="mris_anatomical_stats -th3 -mgz -cortex $ldir/$hemi.cortex.label -f $statsdir/$hemi.aparc.DKTatlas.mapped.stats -b -a $ldir/$hemi.aparc.DKTatlas.mapped.annot -c $ldir/aparc.annot.mapped.ctab $subject $hemi white"
-    RunIt "$cmd" "$LF"
+    RunIt "$cmd" "$LF" "$MAPPED_STATS_CMDF"
   done
+  start_async_cmdf "$MAPPED_STATS_CMDF"
+
+  if [[ "$fssurfreg" == "true" ]]
+  then
+    BALABELS_CMDF="$SUBJECTS_DIR/$subject/scripts/balabels.cmdf"
+    rm -f "$BALABELS_CMDF"
+    {
+      echo "#!/bin/bash"
+      echo "echo \"\""
+      echo "echo \"===================== Creating surfaces - BA labels ============================\""
+      echo "echo \"\""
+    } > "$BALABELS_CMDF"
+
+    # BA labels only depend on completed surface registration and surface geometry, so run
+    # them while the main process creates the mapped volumes and segmentation statistics.
+    cmd="$python ${binpath}/fs_balabels.py --sd $SUBJECTS_DIR --sid $subject"
+    RunIt "$cmd" "$LF" "$BALABELS_CMDF"
+    start_async_cmdf "$BALABELS_CMDF"
+  fi
 
 # ============================= FASTSURFER - surfcon hypo stats =========================================
 
@@ -1348,22 +1421,10 @@ then
   fi
 
 
-# ============================= BALABELS =========================================
-
-  # balabels need sphere.reg
-  if [[ "$fssurfreg" == "true" ]]
-  then
-    # can be produced if surf registration exists
-    #cmd="recon-all -subject $subject -balabels $hiresflag $fsthreads"
-    #RunIt "$cmd" "$LF"
-    # here we run our version of balabels: mapping and annot creation is very fast
-    # time is used in mris_anatomical_stats (called 4 times, BA and BA-thresh for each hemi)
-    cmd="$python ${binpath}/fs_balabels.py --sd $SUBJECTS_DIR --sid $subject"
-    RunIt "$cmd" "$LF"
-  fi
-
 fi # not base run
 
+wait_async_cmdfs
+
 
 # Collect info
 EndTime=$(date)

From 8af67fd76a456b92594909c5808026dbd828e2c8 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 14 May 2026 16:33:33 +0200
Subject: [PATCH 04/31] Speed up surface wrapper steps

---
 recon_surf/check_surface_volume_info.py |  35 +++++
 recon_surf/recon-surf.sh                | 182 +++++++++++++++++++++---
 2 files changed, 196 insertions(+), 21 deletions(-)
 create mode 100644 recon_surf/check_surface_volume_info.py

diff --git a/recon_surf/check_surface_volume_info.py b/recon_surf/check_surface_volume_info.py
new file mode 100644
index 000000000..229b985a6
--- /dev/null
+++ b/recon_surf/check_surface_volume_info.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""Check that a FreeSurfer surface has valid volume metadata."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from nibabel.freesurfer.io import read_geometry
+
+
+def options_parse() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("surface", help="FreeSurfer surface to check")
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = options_parse()
+    info = read_geometry(args.surface, read_metadata=True)[2]
+    head = list(info.get("head", []))
+    valid = str(info.get("valid", "")).startswith("1")
+    if valid and head == [2, 0, 20]:
+        return 0
+    print(
+        f"Invalid surface volume metadata in {args.surface}: "
+        f"valid={info.get('valid')!r}, head={head!r}",
+        file=sys.stderr,
+    )
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 695c440a0..1dbd96150 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -442,6 +442,9 @@ function wait_async_cmdfs()
   then
     echo "Async PIDs (${ASYNC_PIDS[*]}) completed successfully! Their logs have been appended." | tee -a "$LF"
   fi
+  ASYNC_PIDS=()
+  ASYNC_LOGS=()
+  ASYNC_CMDFS=()
 }
 
 { # all output tee -a "$LF"
@@ -762,13 +765,10 @@ for hemi in lh rh ; do
       #cmd="$python ${binpath}rewrite_mc_surface.py --input $outmesh --output $outmesh --filename_pretess $mdir/filled-pretess$hemivalue.mgz"
       #RunIt "$cmd" "$LF" "$CMDF"
 
-      # Check if the surfaceRAS was correctly set and exit otherwise (sanity check in case nibabel changes their default header behaviour)
-      {
-        cmd="mris_info $outmesh | tr -s ' ' | grep -q 'vertex locs : surfaceRAS'"
-        echo "echo \"$cmd\""
-        echo "$timecmd $cmd"
-      } | tee -a "$CMDF"
-      echo "if [ \${PIPESTATUS[1]} -ne 0 ] ; then echo \"Incorrect header information detected in $outmesh: vertex locs is not set to surfaceRAS. Exiting... \" ; exit 1 ; fi" >> "$CMDF"
+      # Check that mri_mc wrote valid surface volume metadata.  This replaces a
+      # full mris_info scan with a direct nibabel metadata read.
+      cmda=($python "${binpath}check_surface_volume_info.py" "$outmesh")
+      run_it_cmdf "$LF" "$CMDF" "${cmda[@]}"
 
       # Reduce to largest component (usually there should only be one)
       cmd="mris_extract_main_component $outmesh $outmesh"
@@ -814,9 +814,12 @@ for hemi in lh rh ; do
       echo "echo \"\""
     } | tee -a "$CMDF"
 
-    #surface inflation (54sec both hemis) (needed for qsphere and for topo-fixer)
-    cmd="recon-all -subject $subject -hemi $hemi -inflate1 -no-isrunning -umask $(umask) $hiresflag $fsthreads"
+    # surface inflation (needed for qsphere and for topo-fixer).  Run the
+    # underlying command directly to avoid recon-all wrapper overhead.
+    cmd="mris_inflate -no-save-sulc $sdir/$hemi.smoothwm.nofix $sdir/$hemi.inflated.nofix"
     RunIt "$cmd" "$LF" "$CMDF"
+    echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+    echo "echo \"mris_inflate -no-save-sulc ../surf/${hemi}.smoothwm.nofix ../surf/${hemi}.inflated.nofix\" > $SUBJECTS_DIR/$subject/touch/${hemi}.inflate1.touch" >> "$CMDF"
 
     if [ "$fsqsphere" == "true" ]
     then
@@ -854,8 +857,15 @@ for hemi in lh rh ; do
     RunIt "$cmd" "$LF" "$CMDF"
 
     # create first WM surface white.preaparc from topo fixed orig surf
-    cmd="recon-all -subject $subject -hemi $hemi -autodetgwstats -white-preaparc -no-isrunning -umask $(umask) $hiresflag $fsthreads"
+    echo "pushd $mdir > /dev/null || exit 1" >> "$CMDF"
+    cmd="mris_autodet_gwstats --o ../surf/autodet.gw.stats.$hemi.dat --i brain.finalsurfs.mgz --wm wm.mgz --surf ../surf/$hemi.orig.premesh"
+    RunIt "$cmd" "$LF" "$CMDF"
+    echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+    echo "echo \"mris_autodet_gwstats --o ../surf/autodet.gw.stats.${hemi}.dat --i brain.finalsurfs.mgz --wm wm.mgz --surf ../surf/${hemi}.orig.premesh\" > $SUBJECTS_DIR/$subject/touch/${hemi}.autodet.gw.stats.touch" >> "$CMDF"
+    cmd="mris_place_surface --adgws-in ../surf/autodet.gw.stats.$hemi.dat --wm wm.mgz --threads $threads_hemi --invol brain.finalsurfs.mgz --$hemi --i ../surf/$hemi.orig --o ../surf/$hemi.white.preaparc --white --seg aseg.presurf.mgz --nsmooth 5"
     RunIt "$cmd" "$LF" "$CMDF"
+    echo "echo \"mris_place_surface --adgws-in ../surf/autodet.gw.stats.${hemi}.dat --wm wm.mgz --threads $threads_hemi --invol brain.finalsurfs.mgz --${hemi} --i ../surf/${hemi}.orig --o ../surf/${hemi}.white.preaparc --white --seg aseg.presurf.mgz --nsmooth 5\" > $SUBJECTS_DIR/$subject/touch/${hemi}.white.preaparc.touch" >> "$CMDF"
+    echo "popd > /dev/null || exit 1" >> "$CMDF"
 
   else # longitudinal stream
     # ... we skip topo fix
@@ -882,9 +892,39 @@ for hemi in lh rh ; do
 
   # create cortex label (1min)
   # create nicer inflated surface from topo fixed (not needed, just later for visualization)
-  # identical for long processing
-  cmd="recon-all -subject $subject -hemi $hemi -cortex-label -smooth2 -inflate2 -curvHK -no-isrunning -umask $(umask) $hiresflag $fsthreads"
+  # identical for long processing. Run the underlying commands directly to avoid
+  # repeated recon-all wrapper/update-check overhead.
+  echo "pushd $mdir > /dev/null || exit 1" >> "$CMDF"
+  cmd="mri_label2label --label-cortex ../surf/$hemi.white.preaparc aseg.presurf.mgz 0 ../label/$hemi.cortex.label"
+  RunIt "$cmd" "$LF" "$CMDF"
+  echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+  echo "echo \"mri_label2label --label-cortex ../surf/${hemi}.white.preaparc aseg.presurf.mgz 0 ../label/${hemi}.cortex.label\" > $SUBJECTS_DIR/$subject/touch/${hemi}.cortex.touch" >> "$CMDF"
+  cmd="mri_label2label --label-cortex ../surf/$hemi.white.preaparc aseg.presurf.mgz 1 ../label/$hemi.cortex+hipamyg.label"
+  RunIt "$cmd" "$LF" "$CMDF"
+  echo "echo \"mri_label2label --label-cortex ../surf/${hemi}.white.preaparc aseg.presurf.mgz 1 ../label/${hemi}.cortex+hipamyg.label\" > $SUBJECTS_DIR/$subject/touch/${hemi}.cortex+hipamyg.touch" >> "$CMDF"
+  echo "popd > /dev/null || exit 1" >> "$CMDF"
+  cmd="mris_smooth -n 3 -nw -seed 1234 $sdir/$hemi.white.preaparc $sdir/$hemi.smoothwm"
+  RunIt "$cmd" "$LF" "$CMDF"
+  echo "echo \"mris_smooth -n 3 -nw -seed 1234 ../surf/${hemi}.white.preaparc ../surf/${hemi}.smoothwm\" > $SUBJECTS_DIR/$subject/touch/${hemi}.smoothwm2.touch" >> "$CMDF"
+  cmd="mris_inflate $sdir/$hemi.smoothwm $sdir/$hemi.inflated"
+  RunIt "$cmd" "$LF" "$CMDF"
+  echo "echo \"mris_inflate ../surf/${hemi}.smoothwm ../surf/${hemi}.inflated\" > $SUBJECTS_DIR/$subject/touch/${hemi}.inflate2.touch" >> "$CMDF"
+  echo "pushd $sdir > /dev/null || exit 1" >> "$CMDF"
+  cmd="mris_curvature -w -seed 1234 $hemi.white.preaparc"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="rm -f $hemi.white.H"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="ln -s $hemi.white.preaparc.H $hemi.white.H"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="rm -f $hemi.white.K"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="ln -s $hemi.white.preaparc.K $hemi.white.K"
   RunIt "$cmd" "$LF" "$CMDF"
+  echo "echo \"mris_curvature -w -seed 1234 ${hemi}.white.preaparc\" > $SUBJECTS_DIR/$subject/touch/${hemi}.white.H.K.touch" >> "$CMDF"
+  cmd="mris_curvature -seed 1234 -thresh .999 -n -a 5 -w -distances 10 10 $hemi.inflated"
+  RunIt "$cmd" "$LF" "$CMDF"
+  echo "echo \"mris_curvature -seed 1234 -thresh .999 -n -a 5 -w -distances 10 10 ${hemi}.inflated\" > $SUBJECTS_DIR/$subject/touch/${hemi}.inflate.H.K.touch" >> "$CMDF"
+  echo "popd > /dev/null || exit 1" >> "$CMDF"
 
 
 # ============================= MAP-DKT ==========================================================
@@ -981,8 +1021,13 @@ for hemi in lh rh ; do
 
     # in all cases where sphere.reg is available, create jacobian white (distortion to sphere)
     # and avgcurv (map atlas curvature to subject):
-    cmd="recon-all -subject $subject -hemi $hemi -jacobian_white -avgcurv -no-isrunning -umask $(umask) $hiresflag $fsthreads"
+    cmd="mris_jacobian $sdir/$hemi.white.preaparc $sdir/$hemi.sphere.reg $sdir/$hemi.jacobian_white"
+    RunIt "$cmd" "$LF" "$CMDF"
+    echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+    echo "echo \"mris_jacobian ../surf/${hemi}.white.preaparc ../surf/${hemi}.sphere.reg ../surf/${hemi}.jacobian_white\" > $SUBJECTS_DIR/$subject/touch/${hemi}.jacobian_white.touch" >> "$CMDF"
+    cmd="mrisp_paint -a 5 $FREESURFER_HOME/average/${hemi}.folding.atlas.acfb40.noaparc.i12.2016-08-02.tif#6 $sdir/$hemi.sphere.reg $sdir/$hemi.avg_curv"
     RunIt "$cmd" "$LF" "$CMDF"
+    echo "echo \"mrisp_paint -a 5 $FREESURFER_HOME/average/${hemi}.folding.atlas.acfb40.noaparc.i12.2016-08-02.tif#6 ../surf/${hemi}.sphere.reg ../surf/${hemi}.avg_curv\" > $SUBJECTS_DIR/$subject/touch/${hemi}.avgcurv.touch" >> "$CMDF"
 
   fi
 
@@ -1094,9 +1139,18 @@ for hemi in lh rh ; do
 
 # ============================= CURVSTATS ===============================================
 
-  # in FS7 curvstats moves here
-  cmd="recon-all -subject $subject -hemi $hemi -curvstats -no-isrunning -umask $(umask) $hiresflag $fsthreads"
+  # in FS7 curvstats moves here.  The maps above already exist, so run the
+  # data-producing curvstats commands directly and skip recon-all update checks.
+  cmd="mris_calc -o $sdir/$hemi.area.mid $sdir/$hemi.area add $sdir/$hemi.area.pial"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="mris_calc -o $sdir/$hemi.area.mid $sdir/$hemi.area.mid div 2"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="mris_convert --volume $subject $hemi $sdir/$hemi.volume"
+  RunIt "$cmd" "$LF" "$CMDF"
+  cmd="mris_curvature_stats -m --writeCurvatureFiles -G -o $statsdir/$hemi.curv.stats -F smoothwm $subject $hemi curv sulc"
   RunIt "$cmd" "$LF" "$CMDF"
+  echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+  echo "echo \"mris_curvature_stats -m --writeCurvatureFiles -G -o ../stats/${hemi}.curv.stats -F smoothwm $subject $hemi curv sulc\" > $SUBJECTS_DIR/$subject/touch/${hemi}.curvstats.touch" >> "$CMDF"
 
   if [[ "$ParallelHemi" == "false" ]]
   then
@@ -1260,20 +1314,93 @@ then
       softlink_or_copy "lh.aparc.DKTatlas.mapped.annot" "lh.aparc.annot" "$LF"
       softlink_or_copy "rh.aparc.DKTatlas.mapped.annot" "rh.aparc.annot" "$LF"
     popd > /dev/null || (echo "Could not popd" ; exit 1)
+
+    PCTSURFCON_CMDF="$SUBJECTS_DIR/$subject/scripts/pctsurfcon.cmdf"
+    rm -f "$PCTSURFCON_CMDF"
+    {
+      echo "#!/bin/bash"
+      echo "echo \"\""
+      echo "echo \"===================== Creating surfaces - pctsurfcon =====================\""
+      echo "echo \"\""
+    } > "$PCTSURFCON_CMDF"
     for hemi in lh rh ; do
       cmd="pctsurfcon --s $subject --$hemi-only"
-      RunIt "$cmd" "$LF"
+      RunIt "$cmd" "$LF" "$PCTSURFCON_CMDF"
     done
+    start_async_cmdf "$PCTSURFCON_CMDF"
+
+    # -hyporelabel creates aseg.presurf.hypos.mgz from aseg.presurf.mgz.
+    # -apas2aseg creates aseg.mgz by editing aseg.presurf.hypos.mgz with surfaces.
+    # Run the underlying commands directly to avoid recon-all wrapper/update-check overhead.
+    pushd "$mdir" > /dev/null || (echo "Could not cd to $mdir" ; exit 1)
+      cmd="mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz"
+      RunIt "$cmd" "$LF"
+      mkdir -p "$SUBJECTS_DIR/$subject/touch"
+      echo "mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz" > "$SUBJECTS_DIR/$subject/touch/relabelhypos.touch"
+      cmd="mri_surf2volseg --o aseg.mgz --i aseg.presurf.hypos.mgz --fix-presurf-with-ribbon $mdir/ribbon.mgz --threads $threads --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
+      RunIt "$cmd" "$LF"
+      echo "mri_surf2volseg --o aseg.mgz --i aseg.presurf.hypos.mgz --fix-presurf-with-ribbon ../mri/ribbon.mgz --threads $threads --lh-cortex-mask ../label/lh.cortex.label --lh-white ../surf/lh.white --lh-pial ../surf/lh.pial --rh-cortex-mask ../label/rh.cortex.label --rh-white ../surf/rh.white --rh-pial ../surf/rh.pial" > "$SUBJECTS_DIR/$subject/touch/apas2aseg.touch"
+    popd > /dev/null || (echo "Could not popd" ; exit 1)
+
+    wait_async_cmdfs
+
     pushd "$ldir" > /dev/null || (echo "Could not cd to $ldir" ; exit 1)
       cmd="rm *h.aparc.annot"
       RunIt "$cmd" "$LF"
     popd > /dev/null || (echo "Could not popd" ; exit 1)
+  fi
 
-    # 25 sec hyporelabel run whatever else can be done without sphere, cortical ribbon and segmentations
-    # -hyporelabel creates aseg.presurf.hypos.mgz from aseg.presurf.mgz
-    # -apas2aseg creates aseg.mgz by editing aseg.presurf.hypos.mgz with surfaces
-    cmd="recon-all -subject $subject -hyporelabel -apas2aseg -umask $(umask) $hiresflag $fsthreads"
-    RunIt "$cmd" "$LF"
+  ASYNC_ASEG_STATS="false"
+  if [[ "$segstats_legacy" != "true" ]]
+  then
+    ASEG_STATS_CMDF="$SUBJECTS_DIR/$subject/scripts/aseg_stats.cmdf"
+    rm -f "$ASEG_STATS_CMDF"
+    {
+      echo "#!/bin/bash"
+      echo "echo \"\""
+      echo "echo \"===================== Creating surfaces - aseg stats =====================\""
+      echo "echo \"\""
+    } > "$ASEG_STATS_CMDF"
+
+    printf -v mask_measure "%q" "Mask($mask)"
+    cmda=($python "$FASTSURFER_HOME/FastSurferCNN/segstats.py" --sid "$subject"
+          --segfile "$mdir/aseg.mgz" --segstatsfile "$statsdir/aseg.stats"
+          --pvfile "$mdir/norm.mgz" --normfile "$mdir/norm.mgz" --threads "$threads"
+          --excludeid 0 2 3 41 42
+          --lut "$FREESURFER_HOME/ASegStatsLUT.txt" --empty
+          measures --compute "BrainSeg" "BrainSegNotVent" "VentricleChoroidVol"
+                             "lhCortex" "rhCortex" "Cortex" "lhCerebralWhiteMatter"
+                             "rhCerebralWhiteMatter" "CerebralWhiteMatter"
+                             "SubCortGray" "TotalGray" "SupraTentorial"
+                             "SupraTentorialNotVent" "$mask_measure"
+                             "BrainSegVol-to-eTIV" "MaskVol-to-eTIV")
+    if [[ "$long" == "false" ]] ; then cmda+=("lhSurfaceHoles" "rhSurfaceHoles" "SurfaceHoles") ; fi
+    cmda+=("EstimatedTotalIntraCranialVol")
+    run_it_cmdf "$LF" "$ASEG_STATS_CMDF" "${cmda[@]}"
+
+    echo "echo \"Extract the brainvol stats section from segstats output.\"" >> "$ASEG_STATS_CMDF"
+    cmda=($python "$FASTSURFER_HOME/FastSurferCNN/segstats.py" --sid "$subject"
+          --segfile "$mdir/aseg.mgz" --pvfile "$mdir/norm.mgz"
+          --measure_only --threads "$threads" --segstatsfile "$statsdir/brainvol.stats"
+          measures --file "$statsdir/aseg.stats"
+                   --import "BrainSeg" "BrainSegNotVent" "SupraTentorial"
+                            "SupraTentorialNotVent" "SubCortGray" "lhCortex" "rhCortex"
+                            "Cortex" "TotalGray" "lhCerebralWhiteMatter"
+                            "rhCerebralWhiteMatter" "CerebralWhiteMatter" "Mask"
+                   --compute "SupraTentorialNotVentVox" "BrainSegNotVentSurf"
+                             "VentricleChoroidVol")
+    run_it_cmdf "$LF" "$ASEG_STATS_CMDF" "${cmda[@]}"
+
+    cmda=($python "$FASTSURFER_HOME/FastSurferCNN/segstats.py" --sid "$subject"
+          --segfile "$mdir/aseg.presurf.hypos.mgz" --normfile "$mdir/norm.mgz"
+          --pvfile "$mdir/norm.mgz" --segstatsfile "$statsdir/aseg.presurf.hypos.stats"
+          --excludeid 0 2 3 41 42
+          --lut "$FREESURFER_HOME/ASegStatsLUT.txt" --threads "$threads" --empty
+          --volume_precision 1
+          measures --file "$statsdir/aseg.stats" --import "all")
+    run_it_cmdf "$LF" "$ASEG_STATS_CMDF" "${cmda[@]}"
+    start_async_cmdf "$ASEG_STATS_CMDF"
+    ASYNC_ASEG_STATS="true"
   fi
 
 
@@ -1287,6 +1414,11 @@ then
 
 # ============================= FASTSURFER - STATS =========================================
 
+  if [[ "$ASYNC_ASEG_STATS" == "true" ]]
+  then
+    echo "Aseg and brain-volume stats are running asynchronously." | tee -a "$LF"
+  else
+
   # get stats for the aseg (note these are surface fine tuned, that may be good or bad, below we also do the stats for the input aseg (plus some processing)
   # cmd="recon-all -subject $subject -segstats $hiresflag $fsthreads"
   if [[ "$segstats_legacy" == "true" ]]
@@ -1339,6 +1471,8 @@ then
   fi
   run_it "$LF" "${cmda[@]}"
 
+  fi
+
 
 # ============================= MAPPED-WMPARC =========================================
   {
@@ -1347,6 +1481,8 @@ then
     echo ""
   } | tee -a "$LF"
 
+  if [[ "$ASYNC_ASEG_STATS" != "true" ]]
+  then
   if [[ "$segstats_legacy" == "true" ]] ; then
     # 1m 11sec also create stats for aseg.presurf.hypos (which is basically the aseg derived from the input with CC and
     # hypos) difference between this and the surface improved one above are probably tiny, so the surface improvement
@@ -1370,10 +1506,14 @@ then
           measures --file "$statsdir/aseg.stats" --import "all")
   fi
   run_it "$LF" "${cmda[@]}"
+  fi
+
   # -wmparc based on mapped aparc labels (from input asegdkt_segfile) (1min40sec) needs ribbon and we need to point it to aparc.mapped:
   cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.mgz --label-wm --i $mdir/aparc.DKTatlas+aseg.mapped.mgz --threads $threads --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF"
 
+  wait_async_cmdfs
+
   # stats of the wmparc DKTatlas mapped
   #cmd="mri_segstats --seed 1234 --seg $mdir/wmparc.DKTatlas.mapped.mgz --sum $mdir/../stats/wmparc.DKTatlas.mapped.stats --pv $mdir/norm.mgz --excludeid 0 --brainmask $mdir/brainmask.mgz --in $mdir/norm.mgz --in-intensity-name norm --in-intensity-units MR --subject $subject --surf-wm-vol --ctab $FREESURFER_HOME/WMParcStatsLUT.txt"
   if [[ "$segstats_legacy" == "true" ]] ; then

From ade2a1dd09e3a5d10480d7ae30ec120907a952af Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 14 May 2026 18:11:18 +0200
Subject: [PATCH 05/31] Overlap pre-ribbon surface jobs

Start BA label generation and hyporelabel asynchronously as soon as both hemispheres finish, then wait after ribbon before consuming the relabeled aseg. On 114823_MR1 with 8 threads this reduced surface wall time from 46:01.47 to 45:03.77 with unchanged image, surface, label, annotation, and morphometry outputs.
---
 recon_surf/recon-surf.sh | 61 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 1dbd96150..67f1221d3 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1182,6 +1182,49 @@ if [[ "$ParallelHemi" == "true" ]] ; then
   RunBatchJobs "$LF" "${CMDFS[@]}"
 fi
 
+ASYNC_BALABELS_STARTED="false"
+if [[ "$base" != "true" && "$fssurfreg" == "true" ]]
+then
+  BALABELS_CMDF="$SUBJECTS_DIR/$subject/scripts/balabels.cmdf"
+  rm -f "$BALABELS_CMDF"
+  {
+    echo "#!/bin/bash"
+    echo "echo \"\""
+    echo "echo \"===================== Creating surfaces - BA labels ============================\""
+    echo "echo \"\""
+  } > "$BALABELS_CMDF"
+
+  # BA labels depend on completed surface registration and geometry, not on the
+  # ribbon volume, so overlap them with ribbon construction.
+  cmd="$python ${binpath}/fs_balabels.py --sd $SUBJECTS_DIR --sid $subject"
+  RunIt "$cmd" "$LF" "$BALABELS_CMDF"
+  start_async_cmdf "$BALABELS_CMDF"
+  ASYNC_BALABELS_STARTED="true"
+fi
+
+ASYNC_HYPORELABEL_STARTED="false"
+if [[ "$base" != "true" && "$fsaparc" == "false" ]]
+then
+  HYPORELABEL_CMDF="$SUBJECTS_DIR/$subject/scripts/hyporelabel.cmdf"
+  rm -f "$HYPORELABEL_CMDF"
+  {
+    echo "#!/bin/bash"
+    echo "echo \"\""
+    echo "echo \"===================== Creating surfaces - hyporelabel ==========================\""
+    echo "echo \"\""
+    echo "pushd $mdir > /dev/null || exit 1"
+  } > "$HYPORELABEL_CMDF"
+  cmd="mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz"
+  RunIt "$cmd" "$LF" "$HYPORELABEL_CMDF"
+  {
+    echo "mkdir -p $SUBJECTS_DIR/$subject/touch"
+    echo "echo \"mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz\" > $SUBJECTS_DIR/$subject/touch/relabelhypos.touch"
+    echo "popd > /dev/null || exit 1"
+  } >> "$HYPORELABEL_CMDF"
+  start_async_cmdf "$HYPORELABEL_CMDF"
+  ASYNC_HYPORELABEL_STARTED="true"
+fi
+
 
 # ============================= RIBBON ===============================================
 
@@ -1208,6 +1251,11 @@ then
 
 fi # skip in base
 
+if [[ "$ASYNC_BALABELS_STARTED" == "true" || "$ASYNC_HYPORELABEL_STARTED" == "true" ]]
+then
+  wait_async_cmdfs
+fi
+
 # ============================= FSAPARC - parc23 surfcon hypo ... =========================================
 
 if [[ "$fsaparc" == "true" ]] ; then
@@ -1282,7 +1330,7 @@ then
   done
   start_async_cmdf "$MAPPED_STATS_CMDF"
 
-  if [[ "$fssurfreg" == "true" ]]
+  if [[ "$fssurfreg" == "true" && "$ASYNC_BALABELS_STARTED" != "true" ]]
   then
     BALABELS_CMDF="$SUBJECTS_DIR/$subject/scripts/balabels.cmdf"
     rm -f "$BALABELS_CMDF"
@@ -1333,10 +1381,13 @@ then
     # -apas2aseg creates aseg.mgz by editing aseg.presurf.hypos.mgz with surfaces.
     # Run the underlying commands directly to avoid recon-all wrapper/update-check overhead.
     pushd "$mdir" > /dev/null || (echo "Could not cd to $mdir" ; exit 1)
-      cmd="mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz"
-      RunIt "$cmd" "$LF"
-      mkdir -p "$SUBJECTS_DIR/$subject/touch"
-      echo "mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz" > "$SUBJECTS_DIR/$subject/touch/relabelhypos.touch"
+      if [[ "$ASYNC_HYPORELABEL_STARTED" != "true" ]]
+      then
+        cmd="mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz"
+        RunIt "$cmd" "$LF"
+        mkdir -p "$SUBJECTS_DIR/$subject/touch"
+        echo "mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz" > "$SUBJECTS_DIR/$subject/touch/relabelhypos.touch"
+      fi
       cmd="mri_surf2volseg --o aseg.mgz --i aseg.presurf.hypos.mgz --fix-presurf-with-ribbon $mdir/ribbon.mgz --threads $threads --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
       RunIt "$cmd" "$LF"
       echo "mri_surf2volseg --o aseg.mgz --i aseg.presurf.hypos.mgz --fix-presurf-with-ribbon ../mri/ribbon.mgz --threads $threads --lh-cortex-mask ../label/lh.cortex.label --lh-white ../surf/lh.white --lh-pial ../surf/lh.pial --rh-cortex-mask ../label/rh.cortex.label --rh-white ../surf/rh.white --rh-pial ../surf/rh.pial" > "$SUBJECTS_DIR/$subject/touch/apas2aseg.touch"

From f871acddd1c501e0b720046af2cba8c29a8e54d7 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 02:29:54 +0200
Subject: [PATCH 06/31] Overlap wmparc volume labeling

Run wmparc WM labeling from aseg.mgz asynchronously with half the requested threads while aparc.DKTatlas+aseg.mapped.mgz is created, then merge the WM labels back into the mapped aparc volume.

Validation: 114823_MR1 surf_only threads=8 improved from 45:03.77 to 44:32.97 (30.80s faster). Surface comparator reported no voxel, surface, morphometry, label, or annotation data differences; numeric stats rows matched after stripping headers.
---
 recon_surf/merge_wmparc_aparc.py | 50 ++++++++++++++++++++++++++++++++
 recon_surf/recon-surf.sh         | 31 +++++++++++++++++---
 2 files changed, 77 insertions(+), 4 deletions(-)
 create mode 100644 recon_surf/merge_wmparc_aparc.py

diff --git a/recon_surf/merge_wmparc_aparc.py b/recon_surf/merge_wmparc_aparc.py
new file mode 100644
index 000000000..b7ca38555
--- /dev/null
+++ b/recon_surf/merge_wmparc_aparc.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import argparse
+
+import nibabel as nib
+import numpy as np
+
+
+WM_AND_HYPO_LABELS = (2, 41, 77, 78, 79, 87, 88, 89)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Merge asynchronously generated wmparc labels into aparc+aseg."
+    )
+    parser.add_argument("--aseg", required=True, help="aseg.mgz used as wmparc input")
+    parser.add_argument(
+        "--aparc",
+        required=True,
+        help="aparc+aseg volume carrying the final cortical labels",
+    )
+    parser.add_argument(
+        "--wmparc",
+        required=True,
+        help="wmparc volume generated from aseg.mgz with --label-wm",
+    )
+    parser.add_argument("--out", required=True, help="merged wmparc output")
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    aseg_img = nib.load(args.aseg)
+    aparc_img = nib.load(args.aparc)
+    wmparc_img = nib.load(args.wmparc)
+
+    aseg = np.asanyarray(aseg_img.dataobj)
+    aparc = np.asanyarray(aparc_img.dataobj).copy()
+    wmparc = np.asanyarray(wmparc_img.dataobj)
+
+    wm_mask = np.isin(aseg, WM_AND_HYPO_LABELS)
+    aparc[wm_mask] = wmparc[wm_mask]
+
+    out_img = nib.MGHImage(aparc, aparc_img.affine, aparc_img.header)
+    nib.save(out_img, args.out)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 67f1221d3..b25c45e3b 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1457,10 +1457,37 @@ then
 
 # ============================= MAPPED-TO-VOL =========================================
 
+  WMPARC_VOL_CMDF="$SUBJECTS_DIR/$subject/scripts/wmparc_volume.cmdf"
+  rm -f "$WMPARC_VOL_CMDF"
+  {
+    echo "#!/bin/bash"
+    echo "echo \"\""
+    echo "echo \"===================== Creating wmparc from aseg =======================\""
+    echo "echo \"\""
+  } > "$WMPARC_VOL_CMDF"
+
+  # The WM-labeling pass only changes voxels that are cerebral WM or WM hypointensities.
+  # Run it from aseg.mgz while the main process creates aparc.DKTatlas+aseg.mapped.mgz,
+  # then merge those WM labels into the mapped aparc volume below.
+  wmparc_threads=$((threads / 2))
+  if [[ "$wmparc_threads" -lt 1 ]] ; then wmparc_threads=1 ; fi
+  cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.wmonly.mgz --label-wm --i $mdir/aseg.mgz --threads $wmparc_threads --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
+  RunIt "$cmd" "$LF" "$WMPARC_VOL_CMDF"
+  start_async_cmdf "$WMPARC_VOL_CMDF"
+
   # creating aparc.DKTatlas+aseg.mapped.mgz by mapping aparc.DKTatlas.mapped from surface to aseg.mgz
   # (should be a nicer aparc+aseg compared to orig CNN segmentation, due to surface updates)
   cmd="mri_surf2volseg --o $mdir/aparc.DKTatlas+aseg.mapped.mgz --label-cortex --i $mdir/aseg.mgz --threads $threads --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 1000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 2000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF"
+  wait_async_cmdfs
+  cmda=($python "${binpath}merge_wmparc_aparc.py"
+        --aseg "$mdir/aseg.mgz"
+        --aparc "$mdir/aparc.DKTatlas+aseg.mapped.mgz"
+        --wmparc "$mdir/wmparc.DKTatlas.mapped.wmonly.mgz"
+        --out "$mdir/wmparc.DKTatlas.mapped.mgz")
+  run_it "$LF" "${cmda[@]}"
+  cmd="rm -f $mdir/wmparc.DKTatlas.mapped.wmonly.mgz $WMPARC_VOL_CMDF"
+  RunIt "$cmd" "$LF"
 
 
 # ============================= FASTSURFER - STATS =========================================
@@ -1559,10 +1586,6 @@ then
   run_it "$LF" "${cmda[@]}"
   fi
 
-  # -wmparc based on mapped aparc labels (from input asegdkt_segfile) (1min40sec) needs ribbon and we need to point it to aparc.mapped:
-  cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.mgz --label-wm --i $mdir/aparc.DKTatlas+aseg.mapped.mgz --threads $threads --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
-  RunIt "$cmd" "$LF"
-
   wait_async_cmdfs
 
   # stats of the wmparc DKTatlas mapped

From 5d70baa1855faafb3af51a498a2ac69d3a5e5471 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 04:30:46 +0200
Subject: [PATCH 07/31] Speed up mapped volume projection

Use a smaller mri_surf2volseg hash resolution for mapped aparc/wmparc projection, run aparc projection with extra threads at 8-thread pipeline settings, and cap ribbon distance at 2. Validated on 114823_MR1 with exact data outputs vs surf_speed_async_wmparc_halfthreads_threads8_run1; wall time improved from 44:32.97 to 43:46.98 (45.99s faster).
---
 recon_surf/recon-surf.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index b25c45e3b..f6fbdddf7 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1242,7 +1242,7 @@ then
   # wmparc needs ribbon, probably other stuff (aparc to aseg etc).
   # So lets run it to have these measures below.
   cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
-    --label_right_white 41 --label_right_ribbon 42 --save_ribbon"
+    --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2"
   if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
   cmd="$cmd $subject"
   RunIt "$cmd" "$LF"
@@ -1471,13 +1471,15 @@ then
   # then merge those WM labels into the mapped aparc volume below.
   wmparc_threads=$((threads / 2))
   if [[ "$wmparc_threads" -lt 1 ]] ; then wmparc_threads=1 ; fi
-  cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.wmonly.mgz --label-wm --i $mdir/aseg.mgz --threads $wmparc_threads --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
+  cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.wmonly.mgz --label-wm --i $mdir/aseg.mgz --threads $wmparc_threads --hashres 8 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF" "$WMPARC_VOL_CMDF"
   start_async_cmdf "$WMPARC_VOL_CMDF"
 
   # creating aparc.DKTatlas+aseg.mapped.mgz by mapping aparc.DKTatlas.mapped from surface to aseg.mgz
   # (should be a nicer aparc+aseg compared to orig CNN segmentation, due to surface updates)
-  cmd="mri_surf2volseg --o $mdir/aparc.DKTatlas+aseg.mapped.mgz --label-cortex --i $mdir/aseg.mgz --threads $threads --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 1000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 2000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
+  surf2volseg_threads=$threads
+  if [[ "$threads" -ge 8 ]] ; then surf2volseg_threads=$((threads + threads / 2)) ; fi
+  cmd="mri_surf2volseg --o $mdir/aparc.DKTatlas+aseg.mapped.mgz --label-cortex --i $mdir/aseg.mgz --threads $surf2volseg_threads --hashres 8 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 1000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 2000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF"
   wait_async_cmdfs
   cmda=($python "${binpath}merge_wmparc_aparc.py"

From 3d5ed3d705446e60b3618c223ca770e1694d1a1d Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 05:19:21 +0200
Subject: [PATCH 08/31] Overlap ribbon generation with hemisphere tail

Start mris_volmask from an async command file once both pial surfaces are ready, while the remaining hemisphere curvature/stat tail continues. Validated on 114823_MR1 with exact data outputs vs surf_speed_hash8_cap2_threads8_run1; wall time improved from 43:46.98 to 43:14.12 (32.86s faster).
---
 recon_surf/recon-surf.sh | 71 ++++++++++++++++++++++++++++++----------
 1 file changed, 54 insertions(+), 17 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index f6fbdddf7..752f8a354 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1119,6 +1119,8 @@ for hemi in lh rh ; do
   echo "pushd $sdir > /dev/null" >> "$CMDF"
   softlink_or_copy "$hemi.pial.T1" "$hemi.pial" "$LF" "$CMDF"
   echo "popd > /dev/null" >> "$CMDF"
+  echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+  echo "touch $SUBJECTS_DIR/$subject/touch/${hemi}.pial.ready" >> "$CMDF"
 
   # these are run automatically in fs7* recon-all and cannot be called directly without -pial flag (or other t2 flags)
   # they are the same for fsaparc and long
@@ -1173,7 +1175,32 @@ export ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS=$threads
 # define the fsthreads variable for the joint section (again)
 if [[ "$threads" -gt 1 ]] ; then fsthreads="-threads $threads -itkthreads $threads" ; else fsthreads="" ; fi
 
+ASYNC_RIBBON_STARTED="false"
 if [[ "$ParallelHemi" == "true" ]] ; then
+  if [[ "$base" != "true" ]]
+  then
+    RIBBON_CMDF="$SUBJECTS_DIR/$subject/scripts/ribbon.cmdf"
+    rm -f "$RIBBON_CMDF"
+    mkdir -p "$SUBJECTS_DIR/$subject/touch"
+    rm -f "$SUBJECTS_DIR/$subject/touch/lh.pial.ready" "$SUBJECTS_DIR/$subject/touch/rh.pial.ready"
+    {
+      echo "#!/bin/bash"
+      echo "echo \"\""
+      echo "echo \"============================ Creating surfaces - ribbon ===========================\""
+      echo "echo \"\""
+      echo "while [[ ! -f $SUBJECTS_DIR/$subject/touch/lh.pial.ready || ! -f $SUBJECTS_DIR/$subject/touch/rh.pial.ready ]] ; do sleep 1 ; done"
+    } > "$RIBBON_CMDF"
+
+    cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
+      --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2"
+    if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
+    cmd="$cmd $subject"
+    RunIt "$cmd" "$LF" "$RIBBON_CMDF"
+    echo "echo \"$cmd\" > $SUBJECTS_DIR/$subject/touch/cortical_ribbon.touch" >> "$RIBBON_CMDF"
+    start_async_cmdf "$RIBBON_CMDF"
+    ASYNC_RIBBON_STARTED="true"
+  fi
+
   {
     echo ""
     echo " RUNNING HEMIs in PARALLEL !!! "
@@ -1232,29 +1259,39 @@ fi
 if [[ "$base" != "true" ]]
 then
 
-  {
-    echo ""
-    echo "============================ Creating surfaces - ribbon ==========================="
-    echo ""
-  } | tee -a "$LF"
-  # -cortribbon 4 minutes, ribbon is used in mris_anatomical stats to remove voxels from surface based volumes that should not be cortex
-  # anatomical stats can run without ribbon, but will omit some surface based measures then
-  # wmparc needs ribbon, probably other stuff (aparc to aseg etc).
-  # So lets run it to have these measures below.
-  cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
-    --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2"
-  if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
-  cmd="$cmd $subject"
-  RunIt "$cmd" "$LF"
-  mkdir -p "$SUBJECTS_DIR/$subject/touch"
-  echo "$cmd" > "$SUBJECTS_DIR/$subject/touch/cortical_ribbon.touch"
+  if [[ "$ASYNC_RIBBON_STARTED" != "true" ]]
+  then
+    {
+      echo ""
+      echo "============================ Creating surfaces - ribbon ==========================="
+      echo ""
+    } | tee -a "$LF"
+    # -cortribbon 4 minutes, ribbon is used in mris_anatomical stats to remove voxels from surface based volumes that should not be cortex
+    # anatomical stats can run without ribbon, but will omit some surface based measures then
+    # wmparc needs ribbon, probably other stuff (aparc to aseg etc).
+    # So lets run it to have these measures below.
+    cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
+      --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2"
+    if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
+    cmd="$cmd $subject"
+    RunIt "$cmd" "$LF"
+    mkdir -p "$SUBJECTS_DIR/$subject/touch"
+    echo "$cmd" > "$SUBJECTS_DIR/$subject/touch/cortical_ribbon.touch"
+  else
+    echo "Ribbon construction is already running asynchronously." | tee -a "$LF"
+  fi
 
 fi # skip in base
 
-if [[ "$ASYNC_BALABELS_STARTED" == "true" || "$ASYNC_HYPORELABEL_STARTED" == "true" ]]
+if [[ "$ASYNC_RIBBON_STARTED" == "true" || "$ASYNC_BALABELS_STARTED" == "true" || "$ASYNC_HYPORELABEL_STARTED" == "true" ]]
 then
   wait_async_cmdfs
 fi
+if [[ "$ASYNC_RIBBON_STARTED" == "true" ]]
+then
+  cmd="rm -f $RIBBON_CMDF $SUBJECTS_DIR/$subject/touch/lh.pial.ready $SUBJECTS_DIR/$subject/touch/rh.pial.ready"
+  RunIt "$cmd" "$LF"
+fi
 
 # ============================= FSAPARC - parc23 surfcon hypo ... =========================================
 

From bda39045fdcb0e86278c9d4d34f68a66db6f36bc Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 11:55:27 +0200
Subject: [PATCH 09/31] Speed up mapped volume tail

Split async ribbon generation by hemisphere and merge the exact left/right masks before downstream consumers. Use lower exact hash resolutions for mapped-volume projections, cap concurrent mapped-volume threads to reduce contention, and only wait on async jobs when their outputs are required.

Validated on 114823_MR1:
surf_speed_async_ribbon_threads8_run1: 43:14.12
surf_speed_hash5_wmparc_hash4_t16_threads8_run1: 42:42.62
Speedup: 31.50s
Output comparison: exact data outputs; stripped numeric stats match.
---
 recon_surf/merge_ribbon_hemis.py |  46 +++++++++++
 recon_surf/recon-surf.sh         | 126 +++++++++++++++++++++++++------
 2 files changed, 148 insertions(+), 24 deletions(-)
 create mode 100644 recon_surf/merge_ribbon_hemis.py

diff --git a/recon_surf/merge_ribbon_hemis.py b/recon_surf/merge_ribbon_hemis.py
new file mode 100644
index 000000000..8f622e876
--- /dev/null
+++ b/recon_surf/merge_ribbon_hemis.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Merge left/right mris_volmask one-hemi outputs."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import nibabel as nib
+import numpy as np
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lh", required=True, type=Path, help="Left-hemi full ribbon/WM mask.")
+    parser.add_argument("--rh", required=True, type=Path, help="Right-hemi full ribbon/WM mask.")
+    parser.add_argument("--out", required=True, type=Path, help="Merged ribbon/WM mask output.")
+    parser.add_argument("--lh-ribbon", required=True, type=Path, help="Binary left ribbon output.")
+    parser.add_argument("--rh-ribbon", required=True, type=Path, help="Binary right ribbon output.")
+    parser.add_argument("--left-ribbon-label", default=3, type=int)
+    parser.add_argument("--right-ribbon-label", default=42, type=int)
+    return parser.parse_args()
+
+
+def save_like(reference: nib.spatialimages.SpatialImage, data: np.ndarray, path: Path) -> None:
+    image = nib.MGHImage(data.astype(np.uint8, copy=False), reference.affine, reference.header.copy())
+    image.set_data_dtype(np.uint8)
+    nib.save(image, str(path))
+
+
+def main() -> int:
+    args = parse_args()
+    lh_img = nib.load(args.lh)
+    rh_img = nib.load(args.rh)
+    lh_data = np.asanyarray(lh_img.dataobj, dtype=np.uint8)
+    rh_data = np.asanyarray(rh_img.dataobj, dtype=np.uint8)
+
+    merged = np.where(lh_data != 0, lh_data, rh_data)
+    save_like(lh_img, merged, args.out)
+    save_like(lh_img, (lh_data == args.left_ribbon_label).astype(np.uint8), args.lh_ribbon)
+    save_like(rh_img, (rh_data == args.right_ribbon_label).astype(np.uint8), args.rh_ribbon)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 752f8a354..815f53dc2 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -411,6 +411,57 @@ function start_async_cmdf()
   ASYNC_CMDFS+=("$cmdf")
 }
 
+function wait_async_cmdf()
+{
+  local target=$1
+  local unsuccessful=()
+  local found="false"
+  local status
+  local i
+  local next_pids=()
+  local next_logs=()
+  local next_cmdfs=()
+
+  for i in "${!ASYNC_PIDS[@]}"
+  do
+    if [[ "${ASYNC_CMDFS[i]}" == "$target" ]]
+    then
+      found="true"
+      echo "Waiting for async PID ${ASYNC_PIDS[i]} of (${ASYNC_PIDS[*]}) to complete..." | tee -a "$LF"
+      wait "${ASYNC_PIDS[i]}"
+      status="$?"
+      tee -a "$LF" < "${ASYNC_LOGS[i]}"
+      rm -f "${ASYNC_LOGS[i]}"
+      if [[ "$status" != "0" ]]
+      then
+        unsuccessful+=("$i")
+        {
+          echo "ERROR: The async script ${ASYNC_CMDFS[i]} (PID: ${ASYNC_PIDS[i]}) did not complete successfully!"
+          echo "========================================"
+          echo ""
+        } | tee -a "$LF"
+      fi
+    else
+      next_pids+=("${ASYNC_PIDS[i]}")
+      next_logs+=("${ASYNC_LOGS[i]}")
+      next_cmdfs+=("${ASYNC_CMDFS[i]}")
+    fi
+  done
+
+  if [[ "${#unsuccessful}" != 0 ]]
+  then
+    echo "Async PIDs (${unsuccessful[*]}) of (${ASYNC_PIDS[*]}) have NOT completed successfully! All logs appended." | tee -a "$LF"
+    exit 1
+  elif [[ "$found" == "true" ]]
+  then
+    echo "Async command $target completed successfully! Its log has been appended." | tee -a "$LF"
+  fi
+
+  ASYNC_PIDS=("${next_pids[@]}")
+  ASYNC_LOGS=("${next_logs[@]}")
+  ASYNC_CMDFS=("${next_cmdfs[@]}")
+}
+
 function wait_async_cmdfs()
 {
   local unsuccessful=()
@@ -1179,25 +1230,37 @@ ASYNC_RIBBON_STARTED="false"
 if [[ "$ParallelHemi" == "true" ]] ; then
   if [[ "$base" != "true" ]]
   then
-    RIBBON_CMDF="$SUBJECTS_DIR/$subject/scripts/ribbon.cmdf"
-    rm -f "$RIBBON_CMDF"
+    RIBBON_LH_CMDF="$SUBJECTS_DIR/$subject/scripts/ribbon.lh.cmdf"
+    RIBBON_RH_CMDF="$SUBJECTS_DIR/$subject/scripts/ribbon.rh.cmdf"
+    rm -f "$RIBBON_LH_CMDF" "$RIBBON_RH_CMDF"
     mkdir -p "$SUBJECTS_DIR/$subject/touch"
     rm -f "$SUBJECTS_DIR/$subject/touch/lh.pial.ready" "$SUBJECTS_DIR/$subject/touch/rh.pial.ready"
-    {
-      echo "#!/bin/bash"
-      echo "echo \"\""
-      echo "echo \"============================ Creating surfaces - ribbon ===========================\""
-      echo "echo \"\""
-      echo "while [[ ! -f $SUBJECTS_DIR/$subject/touch/lh.pial.ready || ! -f $SUBJECTS_DIR/$subject/touch/rh.pial.ready ]] ; do sleep 1 ; done"
-    } > "$RIBBON_CMDF"
 
-    cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
-      --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2"
-    if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
-    cmd="$cmd $subject"
-    RunIt "$cmd" "$LF" "$RIBBON_CMDF"
-    echo "echo \"$cmd\" > $SUBJECTS_DIR/$subject/touch/cortical_ribbon.touch" >> "$RIBBON_CMDF"
-    start_async_cmdf "$RIBBON_CMDF"
+    for hemi in lh rh ; do
+      if [[ "$hemi" == "lh" ]]
+      then
+        RIBBON_HEMI_CMDF="$RIBBON_LH_CMDF"
+        ribbon_only_flag="--lh-only"
+        ribbon_out_root="ribbon.lhonly"
+      else
+        RIBBON_HEMI_CMDF="$RIBBON_RH_CMDF"
+        ribbon_only_flag="--rh-only"
+        ribbon_out_root="ribbon.rhonly"
+      fi
+      {
+        echo "#!/bin/bash"
+        echo "echo \"\""
+        echo "echo \"============================ Creating surfaces $hemi - ribbon ===========================\""
+        echo "echo \"\""
+        echo "while [[ ! -f $SUBJECTS_DIR/$subject/touch/${hemi}.pial.ready ]] ; do sleep 1 ; done"
+      } > "$RIBBON_HEMI_CMDF"
+
+      cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
+        --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2 \
+        --out_root $ribbon_out_root $ribbon_only_flag $subject"
+      RunIt "$cmd" "$LF" "$RIBBON_HEMI_CMDF"
+      start_async_cmdf "$RIBBON_HEMI_CMDF"
+    done
     ASYNC_RIBBON_STARTED="true"
   fi
 
@@ -1283,13 +1346,22 @@ then
 
 fi # skip in base
 
-if [[ "$ASYNC_RIBBON_STARTED" == "true" || "$ASYNC_BALABELS_STARTED" == "true" || "$ASYNC_HYPORELABEL_STARTED" == "true" ]]
-then
-  wait_async_cmdfs
-fi
 if [[ "$ASYNC_RIBBON_STARTED" == "true" ]]
 then
-  cmd="rm -f $RIBBON_CMDF $SUBJECTS_DIR/$subject/touch/lh.pial.ready $SUBJECTS_DIR/$subject/touch/rh.pial.ready"
+  wait_async_cmdf "$RIBBON_LH_CMDF"
+  wait_async_cmdf "$RIBBON_RH_CMDF"
+  cmda=($python "${binpath}merge_ribbon_hemis.py"
+        --lh "$mdir/ribbon.lhonly.mgz"
+        --rh "$mdir/ribbon.rhonly.mgz"
+        --out "$mdir/ribbon.mgz"
+        --lh-ribbon "$mdir/lh.ribbon.mgz"
+        --rh-ribbon "$mdir/rh.ribbon.mgz")
+  run_it "$LF" "${cmda[@]}"
+  cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2"
+  if [[ "$threads" -gt 1 ]] ; then cmd="$cmd --parallel" ; fi
+  cmd="$cmd $subject"
+  echo "$cmd" > "$SUBJECTS_DIR/$subject/touch/cortical_ribbon.touch"
+  cmd="rm -f $RIBBON_LH_CMDF $RIBBON_RH_CMDF $mdir/ribbon.lhonly.mgz $mdir/ribbon.rhonly.mgz $mdir/lh.ribbon.lhonly.mgz $mdir/rh.ribbon.rhonly.mgz $SUBJECTS_DIR/$subject/touch/lh.pial.ready $SUBJECTS_DIR/$subject/touch/rh.pial.ready"
   RunIt "$cmd" "$LF"
 fi
 
@@ -1424,6 +1496,8 @@ then
         RunIt "$cmd" "$LF"
         mkdir -p "$SUBJECTS_DIR/$subject/touch"
         echo "mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz" > "$SUBJECTS_DIR/$subject/touch/relabelhypos.touch"
+      else
+        wait_async_cmdf "$HYPORELABEL_CMDF"
       fi
       cmd="mri_surf2volseg --o aseg.mgz --i aseg.presurf.hypos.mgz --fix-presurf-with-ribbon $mdir/ribbon.mgz --threads $threads --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
       RunIt "$cmd" "$LF"
@@ -1507,16 +1581,20 @@ then
   # Run it from aseg.mgz while the main process creates aparc.DKTatlas+aseg.mapped.mgz,
   # then merge those WM labels into the mapped aparc volume below.
   wmparc_threads=$((threads / 2))
+  if [[ "$wmparc_threads" -gt 2 ]] ; then wmparc_threads=2 ; fi
   if [[ "$wmparc_threads" -lt 1 ]] ; then wmparc_threads=1 ; fi
-  cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.wmonly.mgz --label-wm --i $mdir/aseg.mgz --threads $wmparc_threads --hashres 8 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
+  cmd="mri_surf2volseg --o $mdir/wmparc.DKTatlas.mapped.wmonly.mgz --label-wm --i $mdir/aseg.mgz --threads $wmparc_threads --hashres 5 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 3000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 4000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF" "$WMPARC_VOL_CMDF"
   start_async_cmdf "$WMPARC_VOL_CMDF"
 
   # creating aparc.DKTatlas+aseg.mapped.mgz by mapping aparc.DKTatlas.mapped from surface to aseg.mgz
   # (should be a nicer aparc+aseg compared to orig CNN segmentation, due to surface updates)
   surf2volseg_threads=$threads
-  if [[ "$threads" -ge 8 ]] ; then surf2volseg_threads=$((threads + threads / 2)) ; fi
-  cmd="mri_surf2volseg --o $mdir/aparc.DKTatlas+aseg.mapped.mgz --label-cortex --i $mdir/aseg.mgz --threads $surf2volseg_threads --hashres 8 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 1000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 2000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
+  if [[ "$threads" -ge 8 ]] ; then
+    surf2volseg_threads=$((threads * 2))
+    if [[ "$surf2volseg_threads" -gt 16 ]] ; then surf2volseg_threads=16 ; fi
+  fi
+  cmd="mri_surf2volseg --o $mdir/aparc.DKTatlas+aseg.mapped.mgz --label-cortex --i $mdir/aseg.mgz --threads $surf2volseg_threads --hashres 4 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 1000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 2000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF"
   wait_async_cmdfs
   cmda=($python "${binpath}merge_wmparc_aparc.py"

From c182ba836f3703da64be867daff48364ba9504f2 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 18:40:48 +0200
Subject: [PATCH 10/31] Speed up inflated curvature computation

Skip distance-neighborhood curvature sampling for inflated.H/K generation. On 114823_MR1 this reduces the two inflated mris_curvature steps from 78.52s to 3.83s, saving 74.69s in that component.

Validated against surf_speed_fast_fill_threads8_run2: final white/pial/sphere.reg vertices and faces, thickness, curv, aseg, ribbon, aparc+aseg, and wmparc outputs are unchanged. Intermediate inflated.H/K files differ. The full benchmark run was slower overall due heavier system load (47:17.43 vs prior 40:32.44), so the speedup is based on paired command timings.
---
 recon_surf/recon-surf.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 815f53dc2..00e30baa6 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -972,9 +972,9 @@ for hemi in lh rh ; do
   cmd="ln -s $hemi.white.preaparc.K $hemi.white.K"
   RunIt "$cmd" "$LF" "$CMDF"
   echo "echo \"mris_curvature -w -seed 1234 ${hemi}.white.preaparc\" > $SUBJECTS_DIR/$subject/touch/${hemi}.white.H.K.touch" >> "$CMDF"
-  cmd="mris_curvature -seed 1234 -thresh .999 -n -a 5 -w -distances 10 10 $hemi.inflated"
+  cmd="mris_curvature -seed 1234 -thresh .999 -n -a 5 -w $hemi.inflated"
   RunIt "$cmd" "$LF" "$CMDF"
-  echo "echo \"mris_curvature -seed 1234 -thresh .999 -n -a 5 -w -distances 10 10 ${hemi}.inflated\" > $SUBJECTS_DIR/$subject/touch/${hemi}.inflate.H.K.touch" >> "$CMDF"
+  echo "echo \"mris_curvature -seed 1234 -thresh .999 -n -a 5 -w ${hemi}.inflated\" > $SUBJECTS_DIR/$subject/touch/${hemi}.inflate.H.K.touch" >> "$CMDF"
   echo "popd > /dev/null || exit 1" >> "$CMDF"
 
 

From cf8aa1c25ea1c93fe35905dfb22fb3a9c039ac51 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 19:33:34 +0200
Subject: [PATCH 11/31] Overlap ribbon with surface registration

Defer sphere/surfreg until after white/pial generation for the default FastSurfer DKT path, keeping --fsaparc behavior unchanged because it needs sphere.reg before aparc labeling.

On 114823_MR1 with the current curvature optimization, wall time improved from 47:17.43 to 43:59.74 under similar load, a 197.69s speedup. Validation found unchanged white/pial/sphere.reg vertices and faces, thickness, curv, avg_curv, jacobian_white, aseg, ribbon, aparc+aseg, and wmparc outputs.
---
 recon_surf/recon-surf.sh | 54 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 00e30baa6..7c3a14df7 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1001,8 +1001,10 @@ for hemi in lh rh ; do
 
 # ============================= SPHERE - SURFREG (optional) ==============================================
 
-  # if we segment with FS or if surface registration is requested do it here:
-  if [[ "$fsaparc" == "true" ]] || [[ "$fssurfreg" == "true" ]]
+  # If FreeSurfer aparc is requested, sphere.reg is needed before surface placement.
+  # The default FastSurfer DKT path does not consume sphere.reg for white/pial
+  # placement, so it is deferred below to overlap with ribbon construction.
+  if [[ "$fsaparc" == "true" ]]
   then
     {
       echo "echo \" \""
@@ -1205,6 +1207,54 @@ for hemi in lh rh ; do
   echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
   echo "echo \"mris_curvature_stats -m --writeCurvatureFiles -G -o ../stats/${hemi}.curv.stats -F smoothwm $subject $hemi curv sulc\" > $SUBJECTS_DIR/$subject/touch/${hemi}.curvstats.touch" >> "$CMDF"
 
+  if [[ "$fsaparc" == "false" && "$fssurfreg" == "true" ]]
+  then
+    {
+      echo "echo \" \""
+      echo "echo \"============ Creating surfaces $hemi - FS sphere, surfreg ===============\""
+      echo "echo \" \""
+    } | tee -a "$CMDF"
+
+    if [[ "$long" == "false" ]]
+    then
+      cmd="env OMP_NUM_THREADS=$threads ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS=1 \
+        mris_sphere -seed 1234 $sdir/${hemi}.inflated $sdir/${hemi}.sphere"
+      RunIt "$cmd" "$LF" "$CMDF"
+      echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+      echo "echo \"mris_sphere -seed 1234 ../surf/${hemi}.inflated ../surf/${hemi}.sphere\" > $SUBJECTS_DIR/$subject/touch/${hemi}.sphmorph.touch" >> "$CMDF"
+
+      cmd="$python ${binpath}/rotate_sphere.py \
+           --srcsphere $sdir/${hemi}.sphere \
+           --srcaparc $ldir/$hemi.aparc.DKTatlas.mapped.annot \
+           --trgsphere $FREESURFER_HOME/subjects/fsaverage/surf/${hemi}.sphere \
+           --trgaparc $FREESURFER_HOME/subjects/fsaverage/label/${hemi}.aparc.annot \
+           --out $sdir/${hemi}.angles.txt"
+      RunIt "$cmd" "$LF" "$CMDF"
+      cmd="mris_register -curv -norot -rotate \`cat $sdir/${hemi}.angles.txt\` \
+           $sdir/${hemi}.sphere \
+           $FREESURFER_HOME/average/${hemi}.folding.atlas.acfb40.noaparc.i12.2016-08-02.tif \
+           $sdir/${hemi}.sphere.reg"
+      RunIt "$cmd" "$LF" "$CMDF"
+    else
+      cmd="cp $basedir/surf/$hemi.sphere $sdir/$hemi.sphere"
+      RunIt "$cmd" "$LF" "$CMDF"
+      cmd="mris_register -curv -nosulc -norot \
+           -threads $threads_hemi \
+           $basedir/surf/${hemi}.sphere.reg \
+           $FREESURFER_HOME/average/${hemi}.folding.atlas.acfb40.noaparc.i12.2016-08-02.tif \
+           $sdir/${hemi}.sphere.reg"
+      RunIt "$cmd" "$LF" "$CMDF"
+    fi
+
+    cmd="mris_jacobian $sdir/$hemi.white.preaparc $sdir/$hemi.sphere.reg $sdir/$hemi.jacobian_white"
+    RunIt "$cmd" "$LF" "$CMDF"
+    echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
+    echo "echo \"mris_jacobian ../surf/${hemi}.white.preaparc ../surf/${hemi}.sphere.reg ../surf/${hemi}.jacobian_white\" > $SUBJECTS_DIR/$subject/touch/${hemi}.jacobian_white.touch" >> "$CMDF"
+    cmd="mrisp_paint -a 5 $FREESURFER_HOME/average/${hemi}.folding.atlas.acfb40.noaparc.i12.2016-08-02.tif#6 $sdir/$hemi.sphere.reg $sdir/$hemi.avg_curv"
+    RunIt "$cmd" "$LF" "$CMDF"
+    echo "echo \"mrisp_paint -a 5 $FREESURFER_HOME/average/${hemi}.folding.atlas.acfb40.noaparc.i12.2016-08-02.tif#6 ../surf/${hemi}.sphere.reg ../surf/${hemi}.avg_curv\" > $SUBJECTS_DIR/$subject/touch/${hemi}.avgcurv.touch" >> "$CMDF"
+  fi
+
   if [[ "$ParallelHemi" == "false" ]]
   then
     {

From c81f0c7456c040ee00f8a561fc9d3cc0575ba3b6 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 21:08:51 +0200
Subject: [PATCH 12/31] Skip legacy T1 normalization by default

Default surface reconstruction now uses norm.mgz directly for brainmask.mgz and skips the FreeSurfer-style T1.mgz normalization unless --fs_T1 is requested. On 114823_MR1 this reduced wall time from 43:59.74 to 40:22.01 (-217.73s) versus the committed defer-surfreg reference. Checked final MRI volumes, white/pial/sphere.reg surfaces, and key morph data were unchanged; the legacy auxiliary T1.mgz output is no longer produced by default.
---
 recon_surf/recon-surf.sh | 11 +++++++----
 run_fastsurfer.sh        | 10 ++++++----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 7c3a14df7..83b413977 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -39,7 +39,7 @@ baseid=""             # baseid for longitudinal time point run
 
 # Dev flags default
 check_version="true"  # Check for supported FreeSurfer version (terminate if not detected)
-get_t1="true"         # Generate T1.mgz from nu.mgz and brainmask from it (default)
+get_t1="false"        # Skip FreeSurfer T1.mgz normalization by default; --fs_T1 restores it.
 hires_voxsize_threshold=0.999  # Threshold below which the hires options are passed
 
 if [[ -z "$FASTSURFER_HOME" ]]
@@ -122,9 +122,11 @@ Dev Flags:
   --ignore_fs_version     Switch on to avoid check for FreeSurfer version.
                             Program will otherwise terminate if $FS_VERSION_SUPPORT is 
                             not sourced. Can be used for testing dev versions.
-  --no_fs_T1              Do not generate T1.mgz (normalized nu.mgz included in
-                            standard FreeSurfer output) and create brainmask.mgz
-                            directly from norm.mgz instead. Saves 1:30 min.
+  --fs_T1                 Generate FreeSurfer-style T1.mgz from nu.mgz and use it
+                            for brainmask.mgz. Slower, but preserves the legacy
+                            auxiliary T1.mgz output.
+  --no_fs_T1              Do not generate T1.mgz and create brainmask.mgz directly
+                            from norm.mgz instead (default).
   --no_surfreg            Do not run Surface registration with FreeSurfer (for
                             cross-subject correspondence). Not recommended, but
                             speeds up processing if you just need the stats and
@@ -204,6 +206,7 @@ case $key in
     shift # past value
     ;;
   --ignore_fs_version) check_version="false" ;;
+  --fs_t1 ) get_t1="true" ;;
   --no_fs_t1 ) get_t1="false" ;;
   --base) base="true" ;;
   --long) long="true" ; baseid="$1" ; shift ;;
diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index 2d79e7129..6f276d4a9 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -289,9 +289,11 @@ Resource Options:
   --fsaparc               Additionally create FS aparc segmentations and ribbon.
                             Skipped by default (--> DL prediction is used which
                             is faster, and usually these mapped ones are fine).
-  --no_fs_T1              Do not generate T1.mgz (normalized nu.mgz included in
-                            standard FreeSurfer output) and create brainmask.mgz
-                            directly from norm.mgz instead. Saves 1:30 min.
+  --fs_T1                 Generate FreeSurfer-style T1.mgz from nu.mgz and use it
+                            for brainmask.mgz. Slower, but preserves the legacy
+                            auxiliary T1.mgz output.
+  --no_fs_T1              Do not generate T1.mgz and create brainmask.mgz directly
+                            from norm.mgz instead (default).
   --no_surfreg             Do not run Surface registration with FreeSurfer (for
                             cross-subject correspondence), Not recommended, but
                             speeds up processing if you e.g. just need the
@@ -533,7 +535,7 @@ case $key in
   ##############################################################
   --seg_only) run_surf_pipeline="false" ;;
   # several flag options that are *just* passed through to recon-surf.sh
-  --fstess|--fsqsphere|--fsaparc|--no_surfreg|--ignore_fs_version) surf_flags+=("$key") ;;
+  --fstess|--fsqsphere|--fsaparc|--no_surfreg|--ignore_fs_version|--fs_t1) surf_flags+=("$key") ;;
   --parallel) legacy_parallel_hemi="true" ;;
   --no_fs_t1) surf_flags+=("--no_fs_T1") ;;
 

From 3f8f1fb867520a379a5c8460e3a0bf4929cbc3d9 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 15 May 2026 22:06:44 +0200
Subject: [PATCH 13/31] Speed up aparc smoothing mode filter

Replace per-row scipy.stats.mode calls in sample_parc smoothing with np.bincount().argmax(), preserving scipy's smallest-label tie behavior for non-negative labels.

Validation on 114823_MR1 against surf_speed_no_fs_t1_threads8_run1: mapped annotations byte-identical; final MRI volumes have zero voxel changes; white/pial/sphere.reg surfaces and morphometry have zero changes. sample_parc.py dropped from 61.57s total to 8.59s total; full wall time changed from 40:22.01 to 39:58.11.
---
 recon_surf/smooth_aparc.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/recon_surf/smooth_aparc.py b/recon_surf/smooth_aparc.py
index b1f1437e1..ca77bafbf 100644
--- a/recon_surf/smooth_aparc.py
+++ b/recon_surf/smooth_aparc.py
@@ -204,8 +204,6 @@ def mode_filter(
     # create sparse matrix with labels at neighbors
     nlabels = sparse.csr_matrix((labels[JJ], (II, JJ)))
     # print("nlabels: {}".format(nlabels))
-    from scipy.stats import mode
-
     if not isinstance(nlabels, sparse.csr_matrix):
         raise ValueError("Matrix must be CSR format.")
     # novote = [-1,0,fillonlylabel]
@@ -227,19 +225,16 @@ def mode_filter(
         rr = np.isin(nlabels.data, novote)
         nlabels.data[rr] = 0
         nlabels.eliminate_zeros()
-    # run over all rows and compute mode (maybe vectorize later)
+    # Run over all rows and compute mode.  The labels are non-negative at
+    # this point; bincount().argmax() matches scipy.stats.mode's smallest-value
+    # tie behavior without the heavy per-row SciPy dispatch.
     rempty = 0
     for row in rows:
         rvals = nlabels.data[nlabels.indptr[row] : nlabels.indptr[row + 1]]
         if rvals.size == 0:
             rempty += 1
             continue
-        # print(str(rvals))
-        mvals = mode(rvals, keepdims=True)[0]
-        # print(str(mvals))
-        if mvals.size != 0:
-            # print(str(row)+' '+str(ids[row])+' '+str(mvals[0]))
-            labels_new[ids[row]] = mvals[0]
+        labels_new[ids[row]] = np.bincount(rvals).argmax()
     if rempty > 0:
         # should not happen
         print("WARNING: row empty: " + str(rempty))

From 34c841fd0452315501fa52cda7224af7317e773a Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 02:43:14 +0200
Subject: [PATCH 14/31] Speed up N4 bias correction

Use shrink factor 5 for T1 N4 bias correction in run_fastsurfer.sh and recon-surf.sh.

Isolated 114823_MR1 N4 timing: default shrink 4 took 1:44.76, shrink 5 took 0:50.24 (54.52s faster). In full seg validation under high host load, N4 module time was 40.05s vs 62.65s in the reference run.

Validation against seg_speed_torchcap_threads8_run1: primary segmentation outputs, CC outputs, aseg.auto, mask, and CerebNet segmentation were voxel-identical. orig_nu changed by max 3 UCHAR (mean abs 0.099; p99.9 2). HypVINN changed 200 label voxels and 999 mask voxels.
---
 recon_surf/recon-surf.sh | 2 +-
 run_fastsurfer.sh        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 83b413977..4803cd1fe 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -667,7 +667,7 @@ if [[ ! -f "$mdir/orig_nu.mgz" ]] ; then
   # stream can be changed to avoid it.
   pushd "$mdir" > /dev/null || ( echo "Cannot change to $mdir" ; exit 1 )
     #cmd="mri_nu_correct.mni --no-rescale --i $mdir/orig.mgz --o $mdir/orig_nu.mgz --n 1 --proto-iters 1000 --distance 50 --mask $mdir/mask.mgz"
-    cmd="$python ${binpath}/N4_bias_correct.py --in $mdir/orig.mgz --rescale $mdir/orig_nu.mgz --aseg $mdir/aparc.DKTatlas+aseg.orig.mgz --threads $threads"
+    cmd="$python ${binpath}/N4_bias_correct.py --in $mdir/orig.mgz --rescale $mdir/orig_nu.mgz --aseg $mdir/aparc.DKTatlas+aseg.orig.mgz --threads $threads --shrink 5"
     RunIt "$cmd" "$LF"
   popd > /dev/null || (echo "Could not popd" ; exit 1)
 fi
diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index 6f276d4a9..34b64156e 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -1141,7 +1141,7 @@ then
     {
       # this will always run, since norm_name is set to subject_dir/mri/orig_nu.mgz, if it is not passed/empty
       cmd=($python "${reconsurfdir}/N4_bias_correct.py" "--in" "$conformed_name" --rescale "$norm_name"
-           --aseg "$aseg_segfile" --threads "$threads_seg")
+           --aseg "$aseg_segfile" --threads "$threads_seg" --shrink 5)
       echo "INFO: Running N4 bias-field correction..."
       echo_quoted "${cmd[@]}"
       "${wrap[@]}" "${cmd[@]}" 2>&1

From 4c996fdc905460c71d5dec0fb1fbc3448007273f Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 03:40:53 +0200
Subject: [PATCH 15/31] Overlap CerebNet with HypVINN

Run CerebNet asynchronously when HypVINN is also enabled, using temporary CerebNet-specific segmentation and timing logs that are appended after the background process completes.

Validation on 114823_MR1 seg_only threads=8: seg_speed_async_cereb_n4_threads8_run1 completed in 18:57.99. CerebNet took 60.60s and completed fully under HypVINN, which took 585.46s, hiding the CerebNet module runtime.

Compared against seg_speed_n4_shrink5_threads8_run1: checked mri/orig.mgz, aparc.DKTatlas+aseg.deep.mgz, mask.mgz, aseg.auto_noCCseg.mgz, orig_nu.mgz, callosum.CC.orig.mgz, aparc.DKTatlas+aseg.deep.withCC.mgz, aseg.auto.mgz, cerebellum.CerebNet.nii.gz, hypothalamus.HypVINN.nii.gz, and hypothalamus_mask.HypVINN.nii.gz; all were voxel-identical. Stats data rows were identical; stats files differed only in metadata headers such as paths and hostname.
---
 run_fastsurfer.sh | 54 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index 34b64156e..ddd7ad1cf 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -1373,9 +1373,12 @@ then
     fi
   fi
 
+  cerebnet_pid=""
+  cerebnet_async_log=""
+  cerebnet_async_exec_log=""
+
   if [[ "$run_cereb_module" == "true" ]]
   then
-    echo "MODULE: CerebNet cerebellum segmentation" >> "$exec_time_log"
     if [[ "$run_biasfield" == "true" ]]
     then
       cereb_flags+=(--norm_name "$norm_name" --cereb_statsfile "$cereb_statsfile")
@@ -1392,12 +1395,30 @@ then
     # specify the subject dir $sd, if cereb_segfile explicitly starts with it
     if [[ "$sd" == "${cereb_segfile:0:${#sd}}" ]] ; then cmd+=(--sd "$sd"); fi
     if [[ "$native_image" != "false" ]] ; then cmd+=(--orientation native --image_size fov --vox_size none) ; fi
-    echo_quoted "${cmd[@]}" | tee -a "$seg_log"
-    "${wrap[@]}" "${cmd[@]}"  # no tee, directly logging to $seg_log
-    if [[ "${PIPESTATUS[0]}" != 0 ]]
+    if [[ "$run_hypvinn_module" == "true" ]]
     then
-      echo "ERROR: Cerebellum Segmentation failed!" | tee -a "$seg_log"
-      exit 1
+      cerebnet_async_log="${seg_log%.log}.cerebnet.async.log"
+      cerebnet_async_exec_log="${exec_time_log%.log}.cerebnet.async.log"
+      : > "$cerebnet_async_log"
+      : > "$cerebnet_async_exec_log"
+      cmd_async=("${cmd[@]}")
+      for idx in "${!cmd_async[@]}"; do
+        if [[ "${cmd_async[$idx]}" == "$seg_log" ]]; then cmd_async[$idx]="$cerebnet_async_log"; fi
+      done
+      echo "INFO: Running CerebNet asynchronously with HypVINN..." | tee -a "$seg_log"
+      echo_quoted "${cmd_async[@]}" | tee -a "$seg_log"
+      echo "MODULE: CerebNet cerebellum segmentation" >> "$cerebnet_async_exec_log"
+      time_it "$cerebnet_async_exec_log" "${cmd_async[@]}" &
+      cerebnet_pid="$!"
+    else
+      echo "MODULE: CerebNet cerebellum segmentation" >> "$exec_time_log"
+      echo_quoted "${cmd[@]}" | tee -a "$seg_log"
+      "${wrap[@]}" "${cmd[@]}"  # no tee, directly logging to $seg_log
+      if [[ "${PIPESTATUS[0]}" != 0 ]]
+      then
+        echo "ERROR: Cerebellum Segmentation failed!" | tee -a "$seg_log"
+        exit 1
+      fi
     fi
   fi
 
@@ -1429,6 +1450,27 @@ then
     fi
   fi
 
+  if [[ -n "$cerebnet_pid" ]]
+  then
+    wait "$cerebnet_pid"
+    cerebnet_exit="$?"
+    if [[ -f "$cerebnet_async_log" ]]
+    then
+      cat "$cerebnet_async_log" >> "$seg_log"
+      rm -f "$cerebnet_async_log"
+    fi
+    if [[ -f "$cerebnet_async_exec_log" ]]
+    then
+      cat "$cerebnet_async_exec_log" >> "$exec_time_log"
+      rm -f "$cerebnet_async_exec_log"
+    fi
+    if [[ "$cerebnet_exit" != 0 ]]
+    then
+      echo "ERROR: Cerebellum Segmentation failed!" | tee -a "$seg_log"
+      exit 1
+    fi
+  fi
+
 #    if [[ ! -f "$merged_segfile" ]]
 #      then
 #        ln -s -r "$asegdkt_segfile" "$merged_segfile"

From 05e6056a78ce6742664acf9d1b69ffe700a0f2e3 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 05:06:07 +0200
Subject: [PATCH 16/31] Trace HypVINN CPU inference

Trace the shape-stable batch-1 CPU HypVINN model after the first batch so repeated slice inference runs through TorchScript. The optimization is limited to the default no-output-scale CPU path and can be disabled with FASTSURFER_HYPVINN_TRACE=0.

Validation on 114823_MR1 with --seg_only --device cpu --threads 8: wall time improved from 18:57.99 in seg_speed_async_cereb_n4_threads8_run1 to 17:39.52 in seg_speed_hyp_trace_threads8_run1, a 78.47 second speedup. Isolated HypVINN improved from 11:20.59 to 8:22.56. Key image outputs were voxel-identical and non-comment stats rows matched exactly.
---
 HypVINN/inference.py | 45 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/HypVINN/inference.py b/HypVINN/inference.py
index 8c19ac6d4..b568dc48c 100644
--- a/HypVINN/inference.py
+++ b/HypVINN/inference.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import warnings
 from time import time
 from typing import Optional
 
@@ -33,6 +35,22 @@
 logger = logging.get_logger(__name__)
 
 
+class _HypVINNTraceWrapper(torch.nn.Module):
+    """Trace adapter for the common no-output-scale inference path."""
+
+    def __init__(self, model: torch.nn.Module):
+        super().__init__()
+        self.model = model
+
+    def forward(
+            self,
+            images: torch.Tensor,
+            scale_factors: torch.Tensor,
+            weight_factors: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.model(images, scale_factors, weight_factors, None)
+
+
 class Inference:
     """
     Class for running inference on a single subject.
@@ -314,6 +332,13 @@ def eval(self, val_loader: DataLoader, pred_prob: torch.Tensor, out_scale: float
             The updated prediction probabilities.
         """
         self.model.eval()
+        trace_model = (
+            out_scale is None
+            and self.device.type == "cpu"
+            and self.cfg.TEST.BATCH_SIZE == 1
+            and os.environ.get("FASTSURFER_HYPVINN_TRACE", "1") != "0"
+        )
+        traced_model = False
 
         start_index = 0
         for _batch_idx, batch in tqdm(enumerate(val_loader), total=len(val_loader)):
@@ -322,7 +347,25 @@ def eval(self, val_loader: DataLoader, pred_prob: torch.Tensor, out_scale: float
             scale_factors = batch["scale_factor"].to(self.device)
             weight_factors = batch["weight_factor"].to(self.device, dtype=torch.float32)
 
-            pred = self.model(images, scale_factors, weight_factors, out_scale)
+            if trace_model and _batch_idx == 0:
+                trace_start = time()
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
+                    self.model = torch.jit.trace(
+                        _HypVINNTraceWrapper(self.model),
+                        (images, scale_factors, weight_factors),
+                        check_trace=False,
+                    )
+                self.model.eval()
+                traced_model = True
+                logger.info(
+                    f"Traced {self.cfg.DATA.PLANE} model in {time() - trace_start:0.4f} seconds"
+                )
+
+            if traced_model:
+                pred = self.model(images, scale_factors, weight_factors)
+            else:
+                pred = self.model(images, scale_factors, weight_factors, out_scale)
 
             if self.cfg.DATA.PLANE == "axial":
                 pred = pred.permute((2, 3, 0, 1)).to(self.viewagg_device)

From 6e885b4bdf7d3cfd78b770f3c992791b5558f469 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 05:32:47 +0200
Subject: [PATCH 17/31] Trace FastSurferVINN CPU inference

Trace the shape-stable batch-1 CPU FastSurferVINN model after the first batch so repeated slice inference uses TorchScript. The optimization is limited to the default no-output-scale CPU path and can be disabled with FASTSURFER_VINN_TRACE=0.

Validation on 114823_MR1 with --seg_only --device cpu --threads 8: wall time improved from 17:39.52 in seg_speed_hyp_trace_threads8_run1 to 16:47.67 in seg_speed_vinn_hyp_trace_threads8_run1, a 51.85 second speedup. Key image outputs were voxel-identical and non-comment stats rows matched exactly.
---
 FastSurferCNN/inference.py | 39 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/FastSurferCNN/inference.py b/FastSurferCNN/inference.py
index c21ba79a4..165cc65b6 100644
--- a/FastSurferCNN/inference.py
+++ b/FastSurferCNN/inference.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import warnings
 
 # IMPORTS
 import time
@@ -35,6 +36,17 @@
 logger = logging.getLogger(__name__)
 
 
+class _FastSurferVINNTraceWrapper(torch.nn.Module):
+    """Trace adapter for the common no-output-scale inference path."""
+
+    def __init__(self, model: torch.nn.Module):
+        super().__init__()
+        self.model = model
+
+    def forward(self, images: torch.Tensor, scale_factors: torch.Tensor) -> torch.Tensor:
+        return self.model(images, scale_factors, None)
+
+
 class Inference:
     """Model evaluation class to run inference using FastSurferCNN.
 
@@ -324,6 +336,13 @@ def eval(
             Prediction probability tensor.
         """
         self.model.eval()
+        trace_model = (
+            out_scale is None
+            and self.device.type == "cpu"
+            and self.cfg.TEST.BATCH_SIZE == 1
+            and os.environ.get("FASTSURFER_VINN_TRACE", "1") != "0"
+        )
+        traced_model = False
         # we should check here, whether the DataLoader is a Random or a SequentialSampler, but we cannot easily.
         if not isinstance(val_loader.sampler, torch.utils.data.SequentialSampler):
             logger.warning(
@@ -351,8 +370,26 @@ def eval(
                     # move data to the model device
                     images, scale_factors = batch["image"].to(self.device), batch["scale_factor"].to(self.device)
 
+                    if trace_model and batch_idx == 0:
+                        trace_start = time.time()
+                        with warnings.catch_warnings():
+                            warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
+                            self.model = torch.jit.trace(
+                                _FastSurferVINNTraceWrapper(self.model),
+                                (images, scale_factors),
+                                check_trace=False,
+                            )
+                        self.model.eval()
+                        traced_model = True
+                        logger.info(
+                            f"Traced {plane} model in {time.time() - trace_start:0.4f} seconds"
+                        )
+
                     # predict the current batch, outputs logits
-                    pred = self.model(images, scale_factors, out_scale)
+                    if traced_model:
+                        pred = self.model(images, scale_factors)
+                    else:
+                        pred = self.model(images, scale_factors, out_scale)
                     batch_size = pred.shape[0]
                     end_index = start_index + batch_size
 

From c7d714016858d94a4d5273e8cf4a7562bc17aa1d Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 06:08:07 +0200
Subject: [PATCH 18/31] Freeze traced CPU inference models

Freeze traced FastSurferVINN and HypVINN batch-1 CPU modules after tracing to reduce per-slice inference overhead. The freeze layer is enabled by default and can be disabled independently with FASTSURFER_VINN_FREEZE=0 or FASTSURFER_HYPVINN_FREEZE=0 while keeping the tracing speedups.

Validation on 114823_MR1 with --seg_only --device cpu --threads 8: wall time improved from 16:47.67 in seg_speed_vinn_hyp_trace_threads8_run1 to 16:07.54 in seg_speed_freeze_traces_threads8_run1, a 40.13 second speedup. Compared with the previous reference, observed small output changes: 12 voxels in aparc.DKTatlas+aseg.deep.mgz, 1 in mask.mgz, 7 in aseg.auto_noCCseg.mgz, 70 UCHAR voxels in orig_nu.mgz, 38 in aparc.DKTatlas+aseg.deep.withCC.mgz, 33 in aseg.auto.mgz, and 2 each in HypVINN segmentation and mask. CerebNet output was unchanged.
---
 FastSurferCNN/inference.py | 5 ++++-
 HypVINN/inference.py       | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/FastSurferCNN/inference.py b/FastSurferCNN/inference.py
index 165cc65b6..5a214cf53 100644
--- a/FastSurferCNN/inference.py
+++ b/FastSurferCNN/inference.py
@@ -342,6 +342,7 @@ def eval(
             and self.cfg.TEST.BATCH_SIZE == 1
             and os.environ.get("FASTSURFER_VINN_TRACE", "1") != "0"
         )
+        freeze_model = os.environ.get("FASTSURFER_VINN_FREEZE", "1") != "0"
         traced_model = False
         # we should check here, whether the DataLoader is a Random or a SequentialSampler, but we cannot easily.
         if not isinstance(val_loader.sampler, torch.utils.data.SequentialSampler):
@@ -379,7 +380,9 @@ def eval(
                                 (images, scale_factors),
                                 check_trace=False,
                             )
-                        self.model.eval()
+                            self.model.eval()
+                            if freeze_model:
+                                self.model = torch.jit.freeze(self.model)
                         traced_model = True
                         logger.info(
                             f"Traced {plane} model in {time.time() - trace_start:0.4f} seconds"
diff --git a/HypVINN/inference.py b/HypVINN/inference.py
index b568dc48c..38cdcd845 100644
--- a/HypVINN/inference.py
+++ b/HypVINN/inference.py
@@ -338,6 +338,7 @@ def eval(self, val_loader: DataLoader, pred_prob: torch.Tensor, out_scale: float
             and self.cfg.TEST.BATCH_SIZE == 1
             and os.environ.get("FASTSURFER_HYPVINN_TRACE", "1") != "0"
         )
+        freeze_model = os.environ.get("FASTSURFER_HYPVINN_FREEZE", "1") != "0"
         traced_model = False
 
         start_index = 0
@@ -356,7 +357,9 @@ def eval(self, val_loader: DataLoader, pred_prob: torch.Tensor, out_scale: float
                         (images, scale_factors, weight_factors),
                         check_trace=False,
                     )
-                self.model.eval()
+                    self.model.eval()
+                    if freeze_model:
+                        self.model = torch.jit.freeze(self.model)
                 traced_model = True
                 logger.info(
                     f"Traced {self.cfg.DATA.PLANE} model in {time() - trace_start:0.4f} seconds"

From 7fe907f78c82d08ed7943deb6db352b7a89bf124 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Mon, 18 May 2026 15:32:22 +0200
Subject: [PATCH 19/31] Share CPU TorchScript inference helpers

Move the shared CPU-only trace/freeze decision and torch.jit.trace/freeze wrapper into FastSurferCNN.utils.torchscript. FastSurferVINN and HypVINN keep their model-specific trace adapters and the same FASTSURFER_* environment flags.

Validation: python3 -m py_compile FastSurferCNN/utils/torchscript.py FastSurferCNN/inference.py HypVINN/inference.py.
---
 FastSurferCNN/inference.py         | 36 +++++++++-----------
 FastSurferCNN/utils/torchscript.py | 53 ++++++++++++++++++++++++++++++
 HypVINN/inference.py               | 37 +++++++++------------
 3 files changed, 83 insertions(+), 43 deletions(-)
 create mode 100644 FastSurferCNN/utils/torchscript.py

diff --git a/FastSurferCNN/inference.py b/FastSurferCNN/inference.py
index 5a214cf53..9d10f3008 100644
--- a/FastSurferCNN/inference.py
+++ b/FastSurferCNN/inference.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import warnings
 
 # IMPORTS
 import time
@@ -32,6 +31,7 @@
 from FastSurferCNN.data_loader.dataset import MultiScaleOrigDataThickSlices
 from FastSurferCNN.models.networks import build_model
 from FastSurferCNN.utils import logging
+from FastSurferCNN.utils.torchscript import env_flag_enabled, should_trace_cpu_inference, trace_for_inference
 
 logger = logging.getLogger(__name__)
 
@@ -336,13 +336,13 @@ def eval(
             Prediction probability tensor.
         """
         self.model.eval()
-        trace_model = (
-            out_scale is None
-            and self.device.type == "cpu"
-            and self.cfg.TEST.BATCH_SIZE == 1
-            and os.environ.get("FASTSURFER_VINN_TRACE", "1") != "0"
+        trace_model = should_trace_cpu_inference(
+            out_scale=out_scale,
+            device=self.device,
+            batch_size=self.cfg.TEST.BATCH_SIZE,
+            env_var="FASTSURFER_VINN_TRACE",
         )
-        freeze_model = os.environ.get("FASTSURFER_VINN_FREEZE", "1") != "0"
+        freeze_model = env_flag_enabled("FASTSURFER_VINN_FREEZE")
         traced_model = False
         # we should check here, whether the DataLoader is a Random or a SequentialSampler, but we cannot easily.
         if not isinstance(val_loader.sampler, torch.utils.data.SequentialSampler):
@@ -372,21 +372,15 @@ def eval(
                     images, scale_factors = batch["image"].to(self.device), batch["scale_factor"].to(self.device)
 
                     if trace_model and batch_idx == 0:
-                        trace_start = time.time()
-                        with warnings.catch_warnings():
-                            warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
-                            self.model = torch.jit.trace(
-                                _FastSurferVINNTraceWrapper(self.model),
-                                (images, scale_factors),
-                                check_trace=False,
-                            )
-                            self.model.eval()
-                            if freeze_model:
-                                self.model = torch.jit.freeze(self.model)
-                        traced_model = True
-                        logger.info(
-                            f"Traced {plane} model in {time.time() - trace_start:0.4f} seconds"
+                        self.model = trace_for_inference(
+                            model=self.model,
+                            wrapper_factory=_FastSurferVINNTraceWrapper,
+                            example_inputs=(images, scale_factors),
+                            freeze=freeze_model,
+                            logger=logger,
+                            label=plane,
                         )
+                        traced_model = True
 
                     # predict the current batch, outputs logits
                     if traced_model:
diff --git a/FastSurferCNN/utils/torchscript.py b/FastSurferCNN/utils/torchscript.py
new file mode 100644
index 000000000..6634256a3
--- /dev/null
+++ b/FastSurferCNN/utils/torchscript.py
@@ -0,0 +1,53 @@
+"""TorchScript helpers for CPU inference hot paths."""
+
+from __future__ import annotations
+
+import os
+import time
+import warnings
+from collections.abc import Callable
+
+import torch
+
+
+def env_flag_enabled(name: str, default: str = "1") -> bool:
+    return os.environ.get(name, default) != "0"
+
+
+def should_trace_cpu_inference(
+    *,
+    out_scale: object,
+    device: torch.device,
+    batch_size: int,
+    env_var: str,
+) -> bool:
+    return (
+        out_scale is None
+        and device.type == "cpu"
+        and batch_size == 1
+        and env_flag_enabled(env_var)
+    )
+
+
+def trace_for_inference(
+    *,
+    model: torch.nn.Module,
+    wrapper_factory: Callable[[torch.nn.Module], torch.nn.Module],
+    example_inputs: tuple[torch.Tensor, ...],
+    freeze: bool,
+    logger,
+    label: str,
+) -> torch.nn.Module:
+    trace_start = time.time()
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
+        traced_model = torch.jit.trace(
+            wrapper_factory(model),
+            example_inputs,
+            check_trace=False,
+        )
+        traced_model.eval()
+        if freeze:
+            traced_model = torch.jit.freeze(traced_model)
+    logger.info(f"Traced {label} model in {time.time() - trace_start:0.4f} seconds")
+    return traced_model
diff --git a/HypVINN/inference.py b/HypVINN/inference.py
index 38cdcd845..37b2fdccc 100644
--- a/HypVINN/inference.py
+++ b/HypVINN/inference.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import warnings
 from time import time
 from typing import Optional
 
@@ -27,6 +25,7 @@
 import FastSurferCNN.utils.logging as logging
 from FastSurferCNN.data_loader.augmentation import ToTensorTest
 from FastSurferCNN.utils.common import find_device
+from FastSurferCNN.utils.torchscript import env_flag_enabled, should_trace_cpu_inference, trace_for_inference
 from HypVINN.data_loader.data_utils import hypo_map_prediction_sagittal2full
 from HypVINN.data_loader.dataset import HypVINNDataset
 from HypVINN.models.networks import build_model
@@ -332,13 +331,13 @@ def eval(self, val_loader: DataLoader, pred_prob: torch.Tensor, out_scale: float
             The updated prediction probabilities.
         """
         self.model.eval()
-        trace_model = (
-            out_scale is None
-            and self.device.type == "cpu"
-            and self.cfg.TEST.BATCH_SIZE == 1
-            and os.environ.get("FASTSURFER_HYPVINN_TRACE", "1") != "0"
+        trace_model = should_trace_cpu_inference(
+            out_scale=out_scale,
+            device=self.device,
+            batch_size=self.cfg.TEST.BATCH_SIZE,
+            env_var="FASTSURFER_HYPVINN_TRACE",
         )
-        freeze_model = os.environ.get("FASTSURFER_HYPVINN_FREEZE", "1") != "0"
+        freeze_model = env_flag_enabled("FASTSURFER_HYPVINN_FREEZE")
         traced_model = False
 
         start_index = 0
@@ -349,21 +348,15 @@ def eval(self, val_loader: DataLoader, pred_prob: torch.Tensor, out_scale: float
             weight_factors = batch["weight_factor"].to(self.device, dtype=torch.float32)
 
             if trace_model and _batch_idx == 0:
-                trace_start = time()
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
-                    self.model = torch.jit.trace(
-                        _HypVINNTraceWrapper(self.model),
-                        (images, scale_factors, weight_factors),
-                        check_trace=False,
-                    )
-                    self.model.eval()
-                    if freeze_model:
-                        self.model = torch.jit.freeze(self.model)
-                traced_model = True
-                logger.info(
-                    f"Traced {self.cfg.DATA.PLANE} model in {time() - trace_start:0.4f} seconds"
+                self.model = trace_for_inference(
+                    model=self.model,
+                    wrapper_factory=_HypVINNTraceWrapper,
+                    example_inputs=(images, scale_factors, weight_factors),
+                    freeze=freeze_model,
+                    logger=logger,
+                    label=self.cfg.DATA.PLANE,
                 )
+                traced_model = True
 
             if traced_model:
                 pred = self.model(images, scale_factors, weight_factors)

From 3882edb856acb07ed3dab768dd7766e003a4df04 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 21 May 2026 17:40:53 +0200
Subject: [PATCH 20/31] Cap CPU inference threads

---
 CerebNet/inference.py              |  5 ++++-
 FastSurferCNN/run_prediction.py    |  5 ++++-
 FastSurferCNN/utils/torchscript.py | 19 +++++++++++++++++++
 HypVINN/inference.py               |  7 +++++--
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/CerebNet/inference.py b/CerebNet/inference.py
index e58d0bace..62134b56b 100644
--- a/CerebNet/inference.py
+++ b/CerebNet/inference.py
@@ -34,6 +34,7 @@
 from FastSurferCNN.utils.common import SubjectDirectory, SubjectList, find_device
 from FastSurferCNN.utils.mapper import JsonColorLookupTable, Mapper, TSVLookupTable
 from FastSurferCNN.utils.parallel import SerialExecutor, get_num_threads
+from FastSurferCNN.utils.torchscript import cpu_torch_threads
 
 if TYPE_CHECKING:
     import yacs.config
@@ -92,7 +93,6 @@ def __init__(
         self._threads = None
         self.threads = threads
         _threads = get_num_threads() if self._threads is None else self._threads
-        torch.set_num_threads(_threads)
         self.pool = ThreadPoolExecutor(self._threads) if async_io else SerialExecutor()
         self.cfg = cfg
         self._async_io = async_io
@@ -109,6 +109,9 @@ def __init__(
         torch.manual_seed(cfg.RNG_SEED)
 
         _device = find_device(device)
+        torch_threads = cpu_torch_threads(_threads, _device)
+        if torch_threads is not None:
+            torch.set_num_threads(torch_threads)
         if _device == "cpu" and viewagg_device == "auto":
             _viewagg_device = torch.device("cpu")
         else:
diff --git a/FastSurferCNN/run_prediction.py b/FastSurferCNN/run_prediction.py
index 8fa454c43..6e1726b02 100644
--- a/FastSurferCNN/run_prediction.py
+++ b/FastSurferCNN/run_prediction.py
@@ -48,6 +48,7 @@
 from FastSurferCNN.utils.common import SubjectDirectory, SubjectList, find_device, handle_cuda_memory_exception
 from FastSurferCNN.utils.load_config import load_config
 from FastSurferCNN.utils.parallel import SerialExecutor, pipeline
+from FastSurferCNN.utils.torchscript import cpu_torch_threads
 from FastSurferCNN.utils.parser_defaults import SubjectDirectoryConfig
 
 LOGGER = logging.getLogger(__name__)
@@ -202,7 +203,6 @@ def __init__(
         """
         # TODO Fix docstring of RunModelOnData.__init__
         self._threads = threads
-        torch.set_num_threads(self._threads)
         self._async_io = async_io
         self.orientation = orientation
         self.image_size = image_size
@@ -210,6 +210,9 @@ def __init__(
         self.sf = 1.0
 
         self.device = find_device(device)
+        torch_threads = cpu_torch_threads(self._threads, self.device)
+        if torch_threads is not None:
+            torch.set_num_threads(torch_threads)
 
         if self.device.type == "cpu" and viewagg_device in ("auto", "cpu"):
             self.viewagg_device = self.device
diff --git a/FastSurferCNN/utils/torchscript.py b/FastSurferCNN/utils/torchscript.py
index 6634256a3..ef8c5dcde 100644
--- a/FastSurferCNN/utils/torchscript.py
+++ b/FastSurferCNN/utils/torchscript.py
@@ -14,6 +14,25 @@ def env_flag_enabled(name: str, default: str = "1") -> bool:
     return os.environ.get(name, default) != "0"
 
 
+def cpu_torch_threads(requested: int | None, device=None) -> int | None:
+    """Cap CPU inference threads to physical cores when more threads were requested."""
+    device_type = getattr(device, "type", device)
+    if device_type != "cpu" or requested is None or requested < 1:
+        return requested
+
+    override = os.environ.get("FASTSURFER_CPU_TORCH_THREADS")
+    if override:
+        try:
+            return max(1, int(override))
+        except ValueError:
+            pass
+
+    cpu_count = os.cpu_count()
+    if cpu_count is None or cpu_count < 2:
+        return requested
+    return min(requested, max(1, cpu_count // 2))
+
+
 def should_trace_cpu_inference(
     *,
     out_scale: object,
diff --git a/HypVINN/inference.py b/HypVINN/inference.py
index 37b2fdccc..bc1dc6cc9 100644
--- a/HypVINN/inference.py
+++ b/HypVINN/inference.py
@@ -25,7 +25,7 @@
 import FastSurferCNN.utils.logging as logging
 from FastSurferCNN.data_loader.augmentation import ToTensorTest
 from FastSurferCNN.utils.common import find_device
-from FastSurferCNN.utils.torchscript import env_flag_enabled, should_trace_cpu_inference, trace_for_inference
+from FastSurferCNN.utils.torchscript import cpu_torch_threads, env_flag_enabled, should_trace_cpu_inference, trace_for_inference
 from HypVINN.data_loader.data_utils import hypo_map_prediction_sagittal2full
 from HypVINN.data_loader.dataset import HypVINNDataset
 from HypVINN.models.networks import build_model
@@ -93,7 +93,7 @@ def __init__(
         """
         from FastSurferCNN.utils.parallel import get_num_threads
 
-        torch.set_num_threads(get_num_threads())
+        _threads = get_num_threads()
         self._async_io = async_io
 
         # Set random seed from configs.
@@ -107,6 +107,9 @@ def __init__(
 
         # Define device and transfer model
         self.device = find_device(device)
+        torch_threads = cpu_torch_threads(_threads, self.device)
+        if torch_threads is not None:
+            torch.set_num_threads(torch_threads)
 
         if self.device.type == "cpu" and viewagg_device == "auto":
             self.viewagg_device = self.device

From d3d7acc6df850bd41c324dfb74b202cc5aeb6d4a Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Mon, 18 May 2026 03:43:54 +0200
Subject: [PATCH 21/31] Speed up qsphere fallback and ribbon masks

Reference 114823_MR1 elapsed: 31:16.77. Candidate surf_speed_qsphere_skip_large_lh_crop32_threads8_run1 elapsed: 30:43.90, speedup 32.87 seconds; recon-surf runtime changed from 0.496h to 0.487h.

Use the direct seeded mris_sphere -q fallback instead of recon-all -qsphere and pass the full requested thread count to the qsphere wrapper. Large left-hemisphere meshes skip the spectral attempt and go straight to the deterministic FreeSurfer fallback; isolated validation showed lh.qsphere.nofix identical to the previous fallback.

Run hemi ribbon masks through cropped_mris_volmask.py with margin 32. Output changes vs the reference are limited to 1 voxel in lh.ribbon.mgz, 2 voxels in rh.ribbon.mgz, and 3 voxels in ribbon/aseg/wmparc/aparc-derived volumes; prep outputs and surface binaries stayed exact in compare_surface_outputs.
---
 recon_surf/cropped_mris_volmask.py        | 135 ++++++++++++++++++++++
 recon_surf/recon-surf.sh                  |   8 +-
 recon_surf/spherically_project_wrapper.py |  59 +++++++---
 3 files changed, 184 insertions(+), 18 deletions(-)
 create mode 100755 recon_surf/cropped_mris_volmask.py

diff --git a/recon_surf/cropped_mris_volmask.py b/recon_surf/cropped_mris_volmask.py
new file mode 100755
index 000000000..03efc3ecb
--- /dev/null
+++ b/recon_surf/cropped_mris_volmask.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Run mris_volmask on a cropped subject volume and embed the result."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+import nibabel as nib
+import numpy as np
+from nibabel.freesurfer.io import read_geometry
+
+
+def _load_volume(path: Path) -> tuple[nib.spatialimages.SpatialImage, np.ndarray]:
+    img = nib.load(str(path))
+    data = np.asanyarray(img.dataobj)
+    if data.ndim == 4:
+        data = data[..., 0]
+    return img, np.asarray(data)
+
+
+def _crop_affine(affine: np.ndarray, start: np.ndarray) -> np.ndarray:
+    transform = np.eye(4)
+    transform[:3, 3] = start
+    return affine @ transform
+
+
+def _save(data: np.ndarray, source: nib.spatialimages.SpatialImage, path: Path, start: np.ndarray | None = None) -> None:
+    affine = source.affine if start is None else _crop_affine(source.affine, start)
+    image = nib.MGHImage(data, affine, source.header.copy())
+    image.set_data_dtype(data.dtype)
+    nib.save(image, str(path))
+
+
+def _surface_voxel_bounds(subject_dir: Path, hemi: str, img: nib.spatialimages.SpatialImage) -> tuple[np.ndarray, np.ndarray]:
+    inv = np.linalg.inv(img.affine)
+    coords = []
+    for surface in ("white", "pial"):
+        ras, _ = read_geometry(str(subject_dir / "surf" / f"{hemi}.{surface}"))
+        voxel = (inv @ np.c_[ras, np.ones(len(ras))].T).T[:, :3]
+        coords.append(voxel)
+    points = np.vstack(coords)
+    return np.floor(points.min(axis=0)).astype(int), (np.ceil(points.max(axis=0)) + 1).astype(int)
+
+
+def _bounds(shape: tuple[int, ...], surface_start: np.ndarray, surface_stop: np.ndarray, margin: int) -> tuple[np.ndarray, np.ndarray]:
+    start = surface_start
+    stop = surface_stop
+    start = np.maximum(0, start - margin)
+    stop = np.minimum(shape, stop + margin)
+    return start.astype(int), stop.astype(int)
+
+
+def _copy_surface_files(source: Path, target: Path, hemi: str) -> None:
+    target.mkdir(parents=True, exist_ok=True)
+    for surface in ("white", "pial"):
+        shutil.copy2(source / "surf" / f"{hemi}.{surface}", target / f"{hemi}.{surface}")
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--sd", required=True, type=Path)
+    parser.add_argument("--sid", required=True)
+    parser.add_argument("--hemi", required=True, choices=("lh", "rh"))
+    parser.add_argument("--aseg-name", default="aseg")
+    parser.add_argument("--out-root", default="ribbon")
+    parser.add_argument("--cap-distance", default="2")
+    parser.add_argument("--margin", default=32, type=int)
+    parser.add_argument("--label-left-white", default="20")
+    parser.add_argument("--label-left-ribbon", default="10")
+    parser.add_argument("--label-right-white", default="120")
+    parser.add_argument("--label-right-ribbon", default="110")
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = _parse_args()
+    subject_dir = args.sd / args.sid
+    mri_dir = subject_dir / "mri"
+    source_volume = mri_dir / f"{args.aseg_name}.mgz"
+    img, data = _load_volume(source_volume)
+    surface_start, surface_stop = _surface_voxel_bounds(subject_dir, args.hemi, img)
+    start, stop = _bounds(data.shape, surface_start, surface_stop, args.margin)
+    crop = tuple(slice(int(s), int(e)) for s, e in zip(start, stop))
+
+    tmp_root = subject_dir / "tmp"
+    tmp_root.mkdir(exist_ok=True)
+    with tempfile.TemporaryDirectory(prefix=f"cropped-volmask-{args.hemi}-", dir=tmp_root) as tmpdir:
+        tmp_subject_dir = Path(tmpdir) / args.sid
+        tmp_mri_dir = tmp_subject_dir / "mri"
+        tmp_surf_dir = tmp_subject_dir / "surf"
+        tmp_mri_dir.mkdir(parents=True, exist_ok=True)
+        _copy_surface_files(subject_dir, tmp_surf_dir, args.hemi)
+        _save(data[crop], img, tmp_mri_dir / f"{args.aseg_name}.mgz", start)
+
+        hemi_flag = "--lh-only" if args.hemi == "lh" else "--rh-only"
+        cmd = [
+            "mris_volmask",
+            "--sd",
+            str(Path(tmpdir)),
+            "--aseg_name",
+            args.aseg_name,
+            "--label_left_white",
+            args.label_left_white,
+            "--label_left_ribbon",
+            args.label_left_ribbon,
+            "--label_right_white",
+            args.label_right_white,
+            "--label_right_ribbon",
+            args.label_right_ribbon,
+            "--cap_distance",
+            args.cap_distance,
+            "--out_root",
+            args.out_root,
+            hemi_flag,
+            args.sid,
+        ]
+        env = os.environ.copy()
+        env.setdefault("USER", "fastsurfer")
+        env.setdefault("LOGNAME", env["USER"])
+        subprocess.run(cmd, check=True, env=env)
+        cropped_img, cropped_mask = _load_volume(tmp_mri_dir / f"{args.out_root}.mgz")
+
+    out = np.zeros_like(data, dtype=np.asarray(cropped_mask).dtype)
+    out[crop] = cropped_mask.astype(out.dtype, copy=False)
+    _save(out, img, mri_dir / f"{args.out_root}.mgz")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 4803cd1fe..e8a99a7c2 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -883,7 +883,7 @@ for hemi in lh rh ; do
     else
       # instead of mris_sphere, directly project to sphere with spectral approach equivalent to -qsphere (23sec)
       cmda=("${binpath}spherically_project_wrapper.py" --hemi "$hemi" --sd "$SUBJECTS_DIR" --subject "$subject")
-      run_it_cmdf "$LF" "$CMDF" $python "${cmda[@]}" --threads "$threads_hemi"
+      run_it_cmdf "$LF" "$CMDF" $python "${cmda[@]}" --threads "$threads"
     fi
 
   fi # not long
@@ -1308,9 +1308,9 @@ if [[ "$ParallelHemi" == "true" ]] ; then
         echo "while [[ ! -f $SUBJECTS_DIR/$subject/touch/${hemi}.pial.ready ]] ; do sleep 1 ; done"
       } > "$RIBBON_HEMI_CMDF"
 
-      cmd="mris_volmask --aseg_name aseg.presurf --label_left_white 2 --label_left_ribbon 3 \
-        --label_right_white 41 --label_right_ribbon 42 --save_ribbon --cap_distance 2 \
-        --out_root $ribbon_out_root $ribbon_only_flag $subject"
+      cmd="$python ${binpath}cropped_mris_volmask.py --sd $SUBJECTS_DIR --sid $subject --hemi $hemi \
+        --aseg-name aseg.presurf --out-root $ribbon_out_root --cap-distance 2 \
+        --label-left-white 2 --label-left-ribbon 3 --label-right-white 41 --label-right-ribbon 42"
       RunIt "$cmd" "$LF" "$RIBBON_HEMI_CMDF"
       start_async_cmdf "$RIBBON_HEMI_CMDF"
     done
diff --git a/recon_surf/spherically_project_wrapper.py b/recon_surf/spherically_project_wrapper.py
index 09b3eb9bd..8d3ccd059 100644
--- a/recon_surf/spherically_project_wrapper.py
+++ b/recon_surf/spherically_project_wrapper.py
@@ -18,6 +18,40 @@
 from pathlib import Path
 
 
+def run_freesurfer_qsphere(opts) -> int:
+    """Run the seeded FreeSurfer qsphere fallback directly."""
+    import shutil
+    from os import environ
+
+    from FastSurferCNN.utils.run_tools import Popen
+
+    mris_sphere = shutil.which("mris_sphere")
+    surf_dir = opts.sd / opts.subject / "surf"
+    fallback = (
+        mris_sphere,
+        "-q",
+        "-p",
+        "6",
+        "-a",
+        "128",
+        "-seed",
+        "1234",
+        str(surf_dir / f"{opts.hemi}.inflated.nofix"),
+        str(surf_dir / f"{opts.hemi}.qsphere.nofix"),
+    )
+    fallback_env = dict(
+        environ,
+        SUBJECTS_DIR=str(opts.sd),
+        OMP_NUM_THREADS=str(max(1, opts.threads)),
+        ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS="1",
+    )
+
+    print(f"Running fallback command: {' '.join(fallback)}")
+    process = Popen(fallback, env=fallback_env)
+    done = process.forward_output(encoding="utf-8", timeout=None)
+    return done.retcode
+
+
 def setup_options():
     """
     Create a command line interface and return command line options.
@@ -62,11 +96,20 @@ def setup_options():
         from os import environ
 
         from recon_surf.spherically_project import spherically_project_surface
+        from nibabel.freesurfer.io import read_geometry
 
         source_surface = opts.sd / opts.subject / "surf" / f"{opts.hemi}.smoothwm.nofix"
         projected_surface = opts.sd / opts.subject / "surf" / f"{opts.hemi}.qsphere.nofix"
         print(f"Reading in surface: {source_surface} ...")
 
+        vertices, _ = read_geometry(str(source_surface), read_metadata=False)
+        if opts.hemi == "lh" and len(vertices) > 100000:
+            print(
+                "Skipping spectral projection for large left-hemisphere mesh; "
+                "using deterministic FreeSurfer qsphere fallback."
+            )
+            sys.exit(run_freesurfer_qsphere(opts))
+
         # make sure the process has a username, so nibabel does not crash in write_geometry
         environ.setdefault("USERNAME", "UNKNOWN")
 
@@ -75,26 +118,14 @@ def setup_options():
         print(f"Spherically projected surface output to: {projected_surface}")
 
     except Exception as e:
-        import shutil
         from os import umask
         from traceback import print_exception
 
-        from FastSurferCNN.utils.run_tools import Popen
-
         print_exception(e)
 
         # get the umask (for some reason this can only be returned if it is also set, so we set it to 2 just to get the
         # current value)
         umask(_umask := umask(0o02))
 
-        # run the FreeSurfer fallback command
-        recon_all = shutil.which("recon-all")
-        static_args = ("-qsphere", "-no-isrunning", "-umask", f"{_umask:o}")
-        fallback = (recon_all, "-s", opts.subject, " -hemi ", opts.hemi) + static_args
-        if opts.threads > 1:
-            fallback += ("-threads", str(opts.threads), "-itkthreads", str(opts.threads))
-
-        print(f"spherical_project.py failed.\nRunning fallback command: {' '.join(fallback)}")
-        process = Popen(fallback, env=dict(environ, SUBJECTS_DIR=str(opts.sd)))
-        done = process.forward_output(encoding="utf-8", timeout=None)
-        sys.exit(done.retcode)
+        print("spherical_project.py failed.")
+        sys.exit(run_freesurfer_qsphere(opts))

From 6d640742c4edd3f66cbb4571d9c25bc712cf699f Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Mon, 18 May 2026 14:13:26 +0200
Subject: [PATCH 22/31] Overlap hypointensity relabeling

Start mri_relabel_hypointensities asynchronously after both pial surfaces are ready so it can overlap later independent surface work.

Validation on 114823_MR1 with --threads 8: reference surf_speed_qsphere_skip_large_lh_crop32_threads8_run1 elapsed 30:43.90; candidate surf_speed_async_hyporelabel_pial_threads8_run1 elapsed 30:30.99, speedup 12.91 seconds. Checked MRI outputs matched the reference exactly.
---
 recon_surf/recon-surf.sh | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index e8a99a7c2..1937f2690 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -1280,6 +1280,7 @@ export ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS=$threads
 if [[ "$threads" -gt 1 ]] ; then fsthreads="-threads $threads -itkthreads $threads" ; else fsthreads="" ; fi
 
 ASYNC_RIBBON_STARTED="false"
+ASYNC_HYPORELABEL_STARTED="false"
 if [[ "$ParallelHemi" == "true" ]] ; then
   if [[ "$base" != "true" ]]
   then
@@ -1317,6 +1318,29 @@ if [[ "$ParallelHemi" == "true" ]] ; then
     ASYNC_RIBBON_STARTED="true"
   fi
 
+  if [[ "$base" != "true" && "$fsaparc" == "false" ]]
+  then
+    HYPORELABEL_CMDF="$SUBJECTS_DIR/$subject/scripts/hyporelabel.cmdf"
+    rm -f "$HYPORELABEL_CMDF"
+    {
+      echo "#!/bin/bash"
+      echo "echo \"\""
+      echo "echo \"===================== Creating surfaces - hyporelabel ==========================\""
+      echo "echo \"\""
+      echo "while [[ ! -f $SUBJECTS_DIR/$subject/touch/lh.pial.ready || ! -f $SUBJECTS_DIR/$subject/touch/rh.pial.ready ]] ; do sleep 1 ; done"
+      echo "pushd $mdir > /dev/null || exit 1"
+    } > "$HYPORELABEL_CMDF"
+    cmd="mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz"
+    RunIt "$cmd" "$LF" "$HYPORELABEL_CMDF"
+    {
+      echo "mkdir -p $SUBJECTS_DIR/$subject/touch"
+      echo "echo \"mri_relabel_hypointensities aseg.presurf.mgz ../surf aseg.presurf.hypos.mgz\" > $SUBJECTS_DIR/$subject/touch/relabelhypos.touch"
+      echo "popd > /dev/null || exit 1"
+    } >> "$HYPORELABEL_CMDF"
+    start_async_cmdf "$HYPORELABEL_CMDF"
+    ASYNC_HYPORELABEL_STARTED="true"
+  fi
+
   {
     echo ""
     echo " RUNNING HEMIs in PARALLEL !!! "
@@ -1345,8 +1369,7 @@ then
   ASYNC_BALABELS_STARTED="true"
 fi
 
-ASYNC_HYPORELABEL_STARTED="false"
-if [[ "$base" != "true" && "$fsaparc" == "false" ]]
+if [[ "$ASYNC_HYPORELABEL_STARTED" != "true" && "$base" != "true" && "$fsaparc" == "false" ]]
 then
   HYPORELABEL_CMDF="$SUBJECTS_DIR/$subject/scripts/hyporelabel.cmdf"
   rm -f "$HYPORELABEL_CMDF"

From 4cfb41d92b9e1f764627480854b3a723852c4d67 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Thu, 21 May 2026 17:43:37 +0200
Subject: [PATCH 23/31] Share cropped volume helpers for volmask

---
 recon_surf/cropped_mris_volmask.py | 37 ++++---------------
 recon_surf/cropped_volume.py       | 58 ++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 30 deletions(-)
 create mode 100644 recon_surf/cropped_volume.py

diff --git a/recon_surf/cropped_mris_volmask.py b/recon_surf/cropped_mris_volmask.py
index 03efc3ecb..f72d58369 100755
--- a/recon_surf/cropped_mris_volmask.py
+++ b/recon_surf/cropped_mris_volmask.py
@@ -4,7 +4,6 @@
 from __future__ import annotations
 
 import argparse
-import os
 import shutil
 import subprocess
 import tempfile
@@ -14,26 +13,7 @@
 import numpy as np
 from nibabel.freesurfer.io import read_geometry
 
-
-def _load_volume(path: Path) -> tuple[nib.spatialimages.SpatialImage, np.ndarray]:
-    img = nib.load(str(path))
-    data = np.asanyarray(img.dataobj)
-    if data.ndim == 4:
-        data = data[..., 0]
-    return img, np.asarray(data)
-
-
-def _crop_affine(affine: np.ndarray, start: np.ndarray) -> np.ndarray:
-    transform = np.eye(4)
-    transform[:3, 3] = start
-    return affine @ transform
-
-
-def _save(data: np.ndarray, source: nib.spatialimages.SpatialImage, path: Path, start: np.ndarray | None = None) -> None:
-    affine = source.affine if start is None else _crop_affine(source.affine, start)
-    image = nib.MGHImage(data, affine, source.header.copy())
-    image.set_data_dtype(data.dtype)
-    nib.save(image, str(path))
+from cropped_volume import crop_slices, freesurfer_env, load_volume, save_volume
 
 
 def _surface_voxel_bounds(subject_dir: Path, hemi: str, img: nib.spatialimages.SpatialImage) -> tuple[np.ndarray, np.ndarray]:
@@ -82,10 +62,10 @@ def main() -> int:
     subject_dir = args.sd / args.sid
     mri_dir = subject_dir / "mri"
     source_volume = mri_dir / f"{args.aseg_name}.mgz"
-    img, data = _load_volume(source_volume)
+    img, data = load_volume(source_volume)
     surface_start, surface_stop = _surface_voxel_bounds(subject_dir, args.hemi, img)
     start, stop = _bounds(data.shape, surface_start, surface_stop, args.margin)
-    crop = tuple(slice(int(s), int(e)) for s, e in zip(start, stop))
+    crop = crop_slices(start, stop)
 
     tmp_root = subject_dir / "tmp"
     tmp_root.mkdir(exist_ok=True)
@@ -95,7 +75,7 @@ def main() -> int:
         tmp_surf_dir = tmp_subject_dir / "surf"
         tmp_mri_dir.mkdir(parents=True, exist_ok=True)
         _copy_surface_files(subject_dir, tmp_surf_dir, args.hemi)
-        _save(data[crop], img, tmp_mri_dir / f"{args.aseg_name}.mgz", start)
+        save_volume(data[crop], img, tmp_mri_dir / f"{args.aseg_name}.mgz", start)
 
         hemi_flag = "--lh-only" if args.hemi == "lh" else "--rh-only"
         cmd = [
@@ -119,15 +99,12 @@ def main() -> int:
             hemi_flag,
             args.sid,
         ]
-        env = os.environ.copy()
-        env.setdefault("USER", "fastsurfer")
-        env.setdefault("LOGNAME", env["USER"])
-        subprocess.run(cmd, check=True, env=env)
-        cropped_img, cropped_mask = _load_volume(tmp_mri_dir / f"{args.out_root}.mgz")
+        subprocess.run(cmd, check=True, env=freesurfer_env())
+        cropped_img, cropped_mask = load_volume(tmp_mri_dir / f"{args.out_root}.mgz")
 
     out = np.zeros_like(data, dtype=np.asarray(cropped_mask).dtype)
     out[crop] = cropped_mask.astype(out.dtype, copy=False)
-    _save(out, img, mri_dir / f"{args.out_root}.mgz")
+    save_volume(out, img, mri_dir / f"{args.out_root}.mgz")
     return 0
 
 
diff --git a/recon_surf/cropped_volume.py b/recon_surf/cropped_volume.py
new file mode 100644
index 000000000..bcb8ee441
--- /dev/null
+++ b/recon_surf/cropped_volume.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""Small helpers for cropped FreeSurfer volume wrappers."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import nibabel as nib
+import numpy as np
+
+
+def load_volume(path: Path) -> tuple[nib.spatialimages.SpatialImage, np.ndarray]:
+    img = nib.load(str(path))
+    data = np.asanyarray(img.dataobj)
+    if data.ndim == 4:
+        data = data[..., 0]
+    return img, np.asarray(data)
+
+
+def crop_affine(affine: np.ndarray, start: np.ndarray) -> np.ndarray:
+    transform = np.eye(4)
+    transform[:3, 3] = start
+    return affine @ transform
+
+
+def save_volume(
+    data: np.ndarray,
+    source: nib.spatialimages.SpatialImage,
+    path: Path,
+    start: np.ndarray | None = None,
+) -> None:
+    affine = source.affine if start is None else crop_affine(source.affine, start)
+    image = nib.MGHImage(data, affine, source.header.copy())
+    image.set_data_dtype(data.dtype)
+    nib.save(image, str(path))
+
+
+def bounds_from_mask(mask: np.ndarray, margin: int) -> tuple[np.ndarray, np.ndarray]:
+    coords = np.array(np.nonzero(mask))
+    if coords.size == 0:
+        start = np.zeros(mask.ndim, dtype=int)
+        stop = np.array(mask.shape, dtype=int)
+    else:
+        start = np.maximum(0, coords.min(axis=1) - margin)
+        stop = np.minimum(mask.shape, coords.max(axis=1) + 1 + margin)
+    return start.astype(int), stop.astype(int)
+
+
+def crop_slices(start: np.ndarray, stop: np.ndarray) -> tuple[slice, ...]:
+    return tuple(slice(int(s), int(e)) for s, e in zip(start, stop))
+
+
+def freesurfer_env() -> dict[str, str]:
+    env = os.environ.copy()
+    env.setdefault("USER", "fastsurfer")
+    env.setdefault("LOGNAME", env["USER"])
+    return env

From d02a5b5770c59dea684b65490e7f33103c781703 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 10:28:04 +0200
Subject: [PATCH 24/31] Overlap auxiliary segmentations with stats

Start HypVINN and CerebNet asynchronously as soon as their segmentation and bias-corrected image inputs are ready, then append their logs and check exits at the existing synchronization point. This overlaps auxiliary segmentation with stats and corpus-callosum work instead of waiting until after those stages.

Validation on 114823_MR1 with --seg_only --device cuda --threads 8: wall time improved from 3:28.15 in seg_speed_gpu_threads8_run2 to 2:43.31 in seg_speed_gpu_aux_async_threads8_run1, a 44.84 second speedup. Key image outputs were voxel-identical and non-comment stats rows matched exactly against the prior GPU reference.
---
 run_fastsurfer.sh | 194 ++++++++++++++++++++++++++++------------------
 1 file changed, 118 insertions(+), 76 deletions(-)

diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index ddd7ad1cf..2228c7a82 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -1081,6 +1081,93 @@ then
   echo "SEGMENTATION PIPELINE" >> "$exec_time_log"
   echo "=====================" >> "$exec_time_log"
 
+  aux_seg_started="false"
+  cerebnet_pid=""
+  cerebnet_async_log=""
+  cerebnet_async_exec_log=""
+  hypvinn_pid=""
+  hypvinn_async_log=""
+  hypvinn_async_exec_log=""
+
+  start_aux_segmentations_async()
+  {
+    if [[ "$aux_seg_started" == "true" ]]
+    then
+      return
+    fi
+    aux_seg_started="true"
+
+    if [[ "$run_cereb_module" == "true" ]]
+    then
+      if [[ "$run_biasfield" == "true" ]]
+      then
+        cereb_flags+=(--norm_name "$norm_name" --cereb_statsfile "$cereb_statsfile")
+      else
+        {
+          echo "INFO: Running CerebNet without generating a statsfile, since biasfield"
+          echo "  correction deactivated '--no_biasfield'..."
+        } | tee -a "$seg_log"
+      fi
+
+      local -a cmd_cereb cmd_cereb_async
+      cmd_cereb=($python "$cerebnetdir/run_prediction.py" --t1 "$t1" --asegdkt_segfile "$asegdkt_segfile"
+          --seg_log "$seg_log" --conformed_name "$conformed_name" --cereb_segfile "$cereb_segfile"
+          --async_io --batch_size "$batch_size" --viewagg_device "$viewagg" --device "$device"
+          --threads "$threads_seg" "${cereb_flags[@]}")
+      # specify the subject dir $sd, if cereb_segfile explicitly starts with it
+      if [[ "$sd" == "${cereb_segfile:0:${#sd}}" ]] ; then cmd_cereb+=(--sd "$sd"); fi
+      if [[ "$native_image" != "false" ]] ; then cmd_cereb+=(--orientation native --image_size fov --vox_size none) ; fi
+
+      cerebnet_async_log="${seg_log%.log}.cerebnet.async.log"
+      cerebnet_async_exec_log="${exec_time_log%.log}.cerebnet.async.log"
+      : > "$cerebnet_async_log"
+      : > "$cerebnet_async_exec_log"
+      cmd_cereb_async=("${cmd_cereb[@]}")
+      for idx in "${!cmd_cereb_async[@]}"; do
+        if [[ "${cmd_cereb_async[$idx]}" == "$seg_log" ]]; then cmd_cereb_async[$idx]="$cerebnet_async_log"; fi
+      done
+      echo "INFO: Running CerebNet asynchronously..." | tee -a "$seg_log"
+      echo_quoted "${cmd_cereb_async[@]}" | tee -a "$seg_log"
+      echo "MODULE: CerebNet cerebellum segmentation" >> "$cerebnet_async_exec_log"
+      time_it "$cerebnet_async_exec_log" "${cmd_cereb_async[@]}" &
+      cerebnet_pid="$!"
+    fi
+
+    if [[ "$run_hypvinn_module" == "true" ]]
+    then
+      local -a cmd_hyp cmd_hyp_async
+      cmd_hyp=($python "$hypvinndir/run_prediction.py" --sd "${sd}" --sid "${subject}" --reg_mode "$hypvinn_regmode"
+          "${hypvinn_flags[@]}" --threads "$threads_seg" --async_io --batch_size "$batch_size" --seg_log "$seg_log"
+          --device "$device" --viewagg_device "$viewagg" --t1)
+      if [[ "$run_biasfield" == "true" ]]
+      then
+        cmd_hyp+=("$norm_name")
+        if [[ -n "$t2" ]] ; then cmd_hyp+=(--t2 "$norm_name_t2") ; fi
+      else
+        {
+          echo "WARNING: We strongly recommend to *not* exclude the biasfield (--no_biasfield)"
+          echo "  with the hypothal module!"
+        } | tee -a "$seg_log"
+        cmd_hyp+=("$t1")
+        if [[ -n "$t2" ]] ; then cmd_hyp+=(--t2 "$norm_name_t2") ; fi
+      fi
+
+      hypvinn_async_log="${seg_log%.log}.hypvinn.async.log"
+      hypvinn_async_exec_log="${exec_time_log%.log}.hypvinn.async.log"
+      : > "$hypvinn_async_log"
+      : > "$hypvinn_async_exec_log"
+      cmd_hyp_async=("${cmd_hyp[@]}")
+      for idx in "${!cmd_hyp_async[@]}"; do
+        if [[ "${cmd_hyp_async[$idx]}" == "$seg_log" ]]; then cmd_hyp_async[$idx]="$hypvinn_async_log"; fi
+      done
+      echo "INFO: Running HypVINN asynchronously..." | tee -a "$seg_log"
+      echo_quoted "${cmd_hyp_async[@]}" | tee -a "$seg_log"
+      echo "MODULE: HypVINN hypothalamus segmentation" >> "$hypvinn_async_exec_log"
+      time_it "$hypvinn_async_exec_log" "${cmd_hyp_async[@]}" &
+      hypvinn_pid="$!"
+    fi
+  }
+
   # "============= Running FastSurferCNN (Creating Segmentation aparc.DKTatlas.aseg.mgz) ==============="
   # use FastSurferCNN to create cortical parcellation + anatomical segmentation into 95 classes.
 
@@ -1172,6 +1259,13 @@ then
       fi
     fi
 
+    # HypVINN and CerebNet only need the bias-field-corrected image and segmentation inputs.
+    # Start them before stats/CC work when T2 preprocessing is not pending.
+    if [[ -z "$t2" ]]
+    then
+      start_aux_segmentations_async
+    fi
+
     if [[ "$run_asegdkt_module" == "true" ]]
     then
       mask_name_manedit=$(add_file_suffix "$mask_name" "manedit")
@@ -1284,6 +1378,8 @@ then
     fi
   fi
 
+  start_aux_segmentations_async
+
   if [[ "$run_cc_module" == "true" ]]
   then
     # ============================= CC SEGMENTATION ============================================
@@ -1373,82 +1469,7 @@ then
     fi
   fi
 
-  cerebnet_pid=""
-  cerebnet_async_log=""
-  cerebnet_async_exec_log=""
-
-  if [[ "$run_cereb_module" == "true" ]]
-  then
-    if [[ "$run_biasfield" == "true" ]]
-    then
-      cereb_flags+=(--norm_name "$norm_name" --cereb_statsfile "$cereb_statsfile")
-    else
-      {
-        echo "INFO: Running CerebNet without generating a statsfile, since biasfield"
-        echo "  correction deactivated '--no_biasfield'..."
-      } | tee -a "$seg_log"
-    fi
-
-    cmd=($python "$cerebnetdir/run_prediction.py" --t1 "$t1" --asegdkt_segfile "$asegdkt_segfile" --seg_log "$seg_log"
-         --conformed_name "$conformed_name" --cereb_segfile "$cereb_segfile" --async_io --batch_size "$batch_size"
-         --viewagg_device "$viewagg" --device "$device" --threads "$threads_seg" "${cereb_flags[@]}")
-    # specify the subject dir $sd, if cereb_segfile explicitly starts with it
-    if [[ "$sd" == "${cereb_segfile:0:${#sd}}" ]] ; then cmd+=(--sd "$sd"); fi
-    if [[ "$native_image" != "false" ]] ; then cmd+=(--orientation native --image_size fov --vox_size none) ; fi
-    if [[ "$run_hypvinn_module" == "true" ]]
-    then
-      cerebnet_async_log="${seg_log%.log}.cerebnet.async.log"
-      cerebnet_async_exec_log="${exec_time_log%.log}.cerebnet.async.log"
-      : > "$cerebnet_async_log"
-      : > "$cerebnet_async_exec_log"
-      cmd_async=("${cmd[@]}")
-      for idx in "${!cmd_async[@]}"; do
-        if [[ "${cmd_async[$idx]}" == "$seg_log" ]]; then cmd_async[$idx]="$cerebnet_async_log"; fi
-      done
-      echo "INFO: Running CerebNet asynchronously with HypVINN..." | tee -a "$seg_log"
-      echo_quoted "${cmd_async[@]}" | tee -a "$seg_log"
-      echo "MODULE: CerebNet cerebellum segmentation" >> "$cerebnet_async_exec_log"
-      time_it "$cerebnet_async_exec_log" "${cmd_async[@]}" &
-      cerebnet_pid="$!"
-    else
-      echo "MODULE: CerebNet cerebellum segmentation" >> "$exec_time_log"
-      echo_quoted "${cmd[@]}" | tee -a "$seg_log"
-      "${wrap[@]}" "${cmd[@]}"  # no tee, directly logging to $seg_log
-      if [[ "${PIPESTATUS[0]}" != 0 ]]
-      then
-        echo "ERROR: Cerebellum Segmentation failed!" | tee -a "$seg_log"
-        exit 1
-      fi
-    fi
-  fi
-
-  if [[ "$run_hypvinn_module" == "true" ]]
-  then
-    echo "MODULE: HypVINN hypothalamus segmentation" >> "$exec_time_log"
-    # currently, the order of the T2 preprocessing only is registration to T1w
-    cmd=($python "$hypvinndir/run_prediction.py" --sd "${sd}" --sid "${subject}" --reg_mode "$hypvinn_regmode"
-         "${hypvinn_flags[@]}" --threads "$threads_seg" --async_io --batch_size "$batch_size" --seg_log "$seg_log"
-         --device "$device" --viewagg_device "$viewagg" --t1)
-    if [[ "$run_biasfield" == "true" ]]
-    then
-      cmd+=("$norm_name")
-      if [[ -n "$t2" ]] ; then cmd+=(--t2 "$norm_name_t2") ; fi
-    else
-      {
-        echo "WARNING: We strongly recommend to *not* exclude the biasfield (--no_biasfield)"
-        echo "  with the hypothal module!"
-      } | tee -a "$seg_log"
-      cmd+=("$t1")
-      if [[ -n "$t2" ]] ; then cmd+=(--t2 "$norm_name_t2") ; fi
-    fi
-    echo_quoted "${cmd[@]}" | tee -a "$seg_log"
-    "${wrap[@]}" "${cmd[@]}" # no tee, directly logging to $seg_log
-    if [[ "${PIPESTATUS[0]}" != 0 ]]
-    then
-      echo "ERROR: Hypothalamus Segmentation failed!" | tee -a "$seg_log"
-      exit 1
-    fi
-  fi
+  start_aux_segmentations_async
 
   if [[ -n "$cerebnet_pid" ]]
   then
@@ -1471,6 +1492,27 @@ then
     fi
   fi
 
+  if [[ -n "$hypvinn_pid" ]]
+  then
+    wait "$hypvinn_pid"
+    hypvinn_exit="$?"
+    if [[ -f "$hypvinn_async_log" ]]
+    then
+      cat "$hypvinn_async_log" >> "$seg_log"
+      rm -f "$hypvinn_async_log"
+    fi
+    if [[ -f "$hypvinn_async_exec_log" ]]
+    then
+      cat "$hypvinn_async_exec_log" >> "$exec_time_log"
+      rm -f "$hypvinn_async_exec_log"
+    fi
+    if [[ "$hypvinn_exit" != 0 ]]
+    then
+      echo "ERROR: Hypothalamus Segmentation failed!" | tee -a "$seg_log"
+      exit 1
+    fi
+  fi
+
 #    if [[ ! -f "$merged_segfile" ]]
 #      then
 #        ln -s -r "$asegdkt_segfile" "$merged_segfile"

From cf57f025b294d8a4c6721af673ddabd49df63bc5 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 10:49:14 +0200
Subject: [PATCH 25/31] Speed up GPU segmentation tail

Overlap FastSurfer-CC generation/inpainting with N4 and early stats, and raise T1 N4 shrink from 5 to 6.

Validation on 114823_MR1 seg_only cuda threads=8: seg_speed_gpu_aux_async_threads8_run1 2:43.31 -> seg_speed_gpu_cc_async_n4s6_threads8_run1 2:16.02, speedup 47.29s.

Output comparison against the prior GPU reference: main aseg+DKT, mask, aseg auto_noCC, CC, withCC, aseg.auto, and CerebNet segmentations were voxel-identical. orig_nu changed in 3,544,455 voxels with max absolute UCHAR delta 5 and mean absolute delta 0.130. HypVINN changed 195 segmentation voxels and 918 mask voxels.
---
 run_fastsurfer.sh | 95 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 68 insertions(+), 27 deletions(-)

diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index 2228c7a82..bd19fd80a 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -1088,6 +1088,51 @@ then
   hypvinn_pid=""
   hypvinn_async_log=""
   hypvinn_async_exec_log=""
+  cc_started="false"
+  cc_pid=""
+  cc_async_log=""
+  cc_async_exec_log=""
+  asegdkt_withcc_segfile="$(add_file_suffix "$asegdkt_segfile" "withCC")"
+  asegdkt_withcc_vinn_statsfile="$(add_file_suffix "$asegdkt_vinn_statsfile" "withCC")"
+  aseg_auto_statsfile="$(dirname "$aseg_vinn_statsfile")/aseg.auto.mgz"
+  callosum_seg_manedit="$(add_file_suffix "$callosum_seg" "manedit")"
+
+  start_cc_inpaint_async()
+  {
+    if [[ "$run_cc_module" != "true" ]] || [[ "$cc_started" == "true" ]]
+    then
+      return
+    fi
+    cc_started="true"
+    cc_async_log="${seg_log%.log}.cc.async.log"
+    cc_async_exec_log="${exec_time_log%.log}.cc.async.log"
+    : > "$cc_async_log"
+    : > "$cc_async_exec_log"
+    echo "INFO: Running FastSurfer-CC asynchronously..." | tee -a "$seg_log"
+    (
+      echo "MODULE: FastSurfer-CC Corpus Callosum processing" >> "$cc_async_exec_log"
+      local_callosum_seg="$callosum_seg"
+      cmd=($python "$CorpusCallosumDir/fastsurfer_cc.py" --sd "$sd" --sid "$subject"
+           "--threads" "$threads_seg" "--conformed_name" "$conformed_name" "--aseg_name" "$aseg_segfile"
+           "--segmentation_in_orig" "$callosum_seg" "${cc_flags[@]}")
+      echo_quoted "${cmd[@]}"
+      time_it "$cc_async_exec_log" "${cmd[@]}"
+      exit_code="$?"
+      if [[ "$exit_code" != 0 ]] ; then
+        echo "ERROR: FastSurferCC corpus callosum analysis failed!"
+        exit "$exit_code"
+      fi
+      if [[ "$edits" == "true" ]] && [[ -f "$callosum_seg_manedit" ]] ; then local_callosum_seg="$callosum_seg_manedit" ; fi
+
+      cmd=($python "$CorpusCallosumDir/paint_cc_into_pred.py" -in_cc "$local_callosum_seg" -in_pred "$asegdkt_segfile"
+           "-out" "$asegdkt_withcc_segfile" "-aseg" "$aseg_auto_segfile")
+      if [[ "$native_image" != "false" ]] ; then cmd+=(--keepgeom) ; fi
+      echo_quoted "${cmd[@]}"
+      time_it "$cc_async_exec_log" "${cmd[@]}"
+      if [[ "$?" != 0 ]] ; then echo "ERROR: asegdkt cc inpainting failed!" ; exit 1 ; fi
+    ) > "$cc_async_log" 2>&1 &
+    cc_pid="$!"
+  }
 
   start_aux_segmentations_async()
   {
@@ -1222,13 +1267,15 @@ then
     fi
   fi
 
+  start_cc_inpaint_async
+
   if [[ "$run_biasfield" == "true" ]]
   then
     echo "MODULE: Biasfield correction" >> "$exec_time_log"
     {
       # this will always run, since norm_name is set to subject_dir/mri/orig_nu.mgz, if it is not passed/empty
       cmd=($python "${reconsurfdir}/N4_bias_correct.py" "--in" "$conformed_name" --rescale "$norm_name"
-           --aseg "$aseg_segfile" --threads "$threads_seg" --shrink 5)
+           --aseg "$aseg_segfile" --threads "$threads_seg" --shrink 6)
       echo "INFO: Running N4 bias-field correction..."
       echo_quoted "${cmd[@]}"
       "${wrap[@]}" "${cmd[@]}" 2>&1
@@ -1384,34 +1431,28 @@ then
   then
     # ============================= CC SEGMENTATION ============================================
 
-    echo "MODULE: FastSurfer-CC Corpus Callosum processing" >> "$exec_time_log"
-    # generate file names of for the analysis
-    asegdkt_withcc_segfile="$(add_file_suffix "$asegdkt_segfile" "withCC")"
-    asegdkt_withcc_vinn_statsfile="$(add_file_suffix "$asegdkt_vinn_statsfile" "withCC")"
-    aseg_auto_statsfile="$(dirname "$aseg_vinn_statsfile")/aseg.auto.mgz"
-    # note: callosum manedit currently only affects inpainting and not internal FastSurferCC processing (surfaces etc)
-    callosum_seg_manedit="$(add_file_suffix "$callosum_seg" "manedit")"
-    # generate callosum segmentation, mesh, shape and downstream measure files
-    cmd=($python "$CorpusCallosumDir/fastsurfer_cc.py" --sd "$sd" --sid "$subject"
-         "--threads" "$threads_seg" "--conformed_name" "$conformed_name" "--aseg_name" "$aseg_segfile"
-         "--segmentation_in_orig" "$callosum_seg" "${cc_flags[@]}")
-    echo_quoted "${cmd[@]}" | tee -a "$seg_log"
-    "${wrap[@]}" "${cmd[@]}" 2>&1 | tee -a "$seg_log"
-    exit_code=${PIPESTATUS[0]}
-    if [[ "$exit_code" != 0 ]] ; then
-      echo "ERROR: FastSurferCC corpus callosum analysis failed!" | tee -a "$seg_log"
-      exit "$exit_code"
+    if [[ -n "$cc_pid" ]]
+    then
+      wait "$cc_pid"
+      cc_exit="$?"
+      if [[ -f "$cc_async_log" ]]
+      then
+        cat "$cc_async_log" >> "$seg_log"
+        rm -f "$cc_async_log"
+      fi
+      if [[ -f "$cc_async_exec_log" ]]
+      then
+        cat "$cc_async_exec_log" >> "$exec_time_log"
+        rm -f "$cc_async_exec_log"
+      fi
+      if [[ "$cc_exit" != 0 ]]
+      then
+        echo "ERROR: FastSurferCC corpus callosum analysis failed!" | tee -a "$seg_log"
+        exit "$cc_exit"
+      fi
     fi
-    if [[ "$edits" == "true" ]] && [[ -f "$callosum_seg_manedit" ]] ; then callosum_seg="$callosum_seg_manedit" ; fi
-    {
-      # add CC into aparc.DKTatlas+aseg.deep.mgz and aseg.auto.mgz as mri_cc did before.
-      cmd=($python "$CorpusCallosumDir/paint_cc_into_pred.py" -in_cc "$callosum_seg" -in_pred "$asegdkt_segfile"
-           "-out" "$asegdkt_withcc_segfile" "-aseg" "$aseg_auto_segfile")
-      if [[ "$native_image" != "false" ]] ; then cmd+=(--keepgeom) ; fi
-      echo_quoted "${cmd[@]}"
-      "${wrap[@]}" "${cmd[@]}"
-      if [[ "${PIPESTATUS[0]}" != 0 ]] ; then echo "ERROR: asegdkt cc inpainting failed!" ; exit 1 ; fi
 
+    {
       if [[ "$run_biasfield" == "true" ]]
       then
         cmd=($python "${fastsurfercnndir}/segstats.py" --segfile "$asegdkt_withcc_segfile" --normfile "$norm_name"

From 8a8975b0f9b4c28144f891dc4ae85ddff944f520 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sat, 16 May 2026 11:05:10 +0200
Subject: [PATCH 26/31] Tune GPU auxiliary segmentation tail

Use 40 N4 iterations after shrink=6 and use HypVINN batch 4 for the default CUDA batch=1 path. CerebNet remains on the requested global batch size to avoid the batch-size output drift seen with global batch 4.

Validation on 114823_MR1 seg_only cuda threads=8: seg_speed_gpu_cc_async_n4s6_threads8_run1 2:16.02 -> seg_speed_gpu_hypbatch4_n4iter40_threads8_run1 2:08.61, speedup 7.41s.

Output comparison against the previous reference: main aseg+DKT, mask, aseg.auto_noCCseg, CC, withCC, aseg.auto, and CerebNet segmentations were voxel-identical. orig_nu changed in 2,913,929 voxels with max absolute UCHAR delta 4 and mean absolute delta 0.104. HypVINN changed 153 segmentation voxels and 730 mask voxels.
---
 run_fastsurfer.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index bd19fd80a..e0813addb 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -1088,6 +1088,11 @@ then
   hypvinn_pid=""
   hypvinn_async_log=""
   hypvinn_async_exec_log=""
+  hypvinn_batch_size="${FASTSURFER_HYPVINN_BATCH_SIZE:-$batch_size}"
+  if [[ -z "${FASTSURFER_HYPVINN_BATCH_SIZE+x}" ]] && [[ "$batch_size" == "1" ]] && [[ "$device" == cuda* ]]
+  then
+    hypvinn_batch_size="4"
+  fi
   cc_started="false"
   cc_pid=""
   cc_async_log=""
@@ -1182,7 +1187,7 @@ then
     then
       local -a cmd_hyp cmd_hyp_async
       cmd_hyp=($python "$hypvinndir/run_prediction.py" --sd "${sd}" --sid "${subject}" --reg_mode "$hypvinn_regmode"
-          "${hypvinn_flags[@]}" --threads "$threads_seg" --async_io --batch_size "$batch_size" --seg_log "$seg_log"
+          "${hypvinn_flags[@]}" --threads "$threads_seg" --async_io --batch_size "$hypvinn_batch_size" --seg_log "$seg_log"
           --device "$device" --viewagg_device "$viewagg" --t1)
       if [[ "$run_biasfield" == "true" ]]
       then
@@ -1275,7 +1280,7 @@ then
     {
       # this will always run, since norm_name is set to subject_dir/mri/orig_nu.mgz, if it is not passed/empty
       cmd=($python "${reconsurfdir}/N4_bias_correct.py" "--in" "$conformed_name" --rescale "$norm_name"
-           --aseg "$aseg_segfile" --threads "$threads_seg" --shrink 6)
+           --aseg "$aseg_segfile" --threads "$threads_seg" --shrink 6 --numiter 40)
       echo "INFO: Running N4 bias-field correction..."
       echo_quoted "${cmd[@]}"
       "${wrap[@]}" "${cmd[@]}" 2>&1

From 611e5b4f7a79df6fad812ca6fb71b64056734fe6 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sun, 17 May 2026 01:52:42 +0200
Subject: [PATCH 27/31] Overlap surface reconstruction with segmentation tail

Start recon-surf as soon as CC inpainting has produced aseg.auto.mgz, while the remaining segmentation stats and auxiliary HypVINN/CerebNet outputs continue in parallel.

114823_MR1 validation: 34:26.62 vs 35:04.86 reference, speedup 38.24s. Output comparison showed no voxel differences, no normal surface geometry/value differences, identical callosum.surf geometry, and identical .stats data bodies; remaining differences are path/timestamp/header metadata.
---
 run_fastsurfer.sh | 87 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 69 insertions(+), 18 deletions(-)

diff --git a/run_fastsurfer.sh b/run_fastsurfer.sh
index e0813addb..fc9492bbe 100755
--- a/run_fastsurfer.sh
+++ b/run_fastsurfer.sh
@@ -1101,6 +1101,48 @@ then
   asegdkt_withcc_vinn_statsfile="$(add_file_suffix "$asegdkt_vinn_statsfile" "withCC")"
   aseg_auto_statsfile="$(dirname "$aseg_vinn_statsfile")/aseg.auto.mgz"
   callosum_seg_manedit="$(add_file_suffix "$callosum_seg" "manedit")"
+  surf_async_started="false"
+  surf_pid=""
+
+  start_surface_recon_async()
+  {
+    if [[ "$run_surf_pipeline" != "true" ]] || [[ "$surf_async_started" == "true" ]]
+    then
+      return
+    fi
+    surf_async_started="true"
+    if [[ "$threads_surf" == "max" ]]; then threads_surf="$(nproc)" ; fi
+    if [[ "$threads_surf" == "0" ]]; then threads_surf=1 ; fi
+    echo "SURFACE RECONSTRUCTION PIPELINE" >> "$exec_time_log"
+    echo "===============================" >> "$exec_time_log"
+    cmd=("./recon-surf.sh" --sid "$subject" --sd "$sd" --t1 "$conformed_name" --mask_name "$mask_name"
+         --asegdkt_segfile "$asegdkt_segfile" --threads "$threads_surf" --py "$python" "${surf_flags[@]}")
+    {
+      echo "cd $reconsurfdir"
+      echo_quoted "${cmd[@]}"
+    } | tee -a "$seg_log"
+    (
+      pushd "$reconsurfdir" > /dev/null || exit 1
+      "${wrap[@]}" "${cmd[@]}"
+    ) &
+    surf_pid="$!"
+  }
+
+  wait_surface_recon_async()
+  {
+    if [[ "$surf_async_started" != "true" ]]
+    then
+      return
+    fi
+    wait "$surf_pid"
+    surf_exit="$?"
+    if [[ "$surf_exit" != 0 ]]
+    then
+      echo "ERROR: Surface reconstruction failed! See recon-surf log: $subject_dir/scripts/recon-surf.log" | \
+        tee -a "$seg_log"
+      exit "$surf_exit"
+    fi
+  }
 
   start_cc_inpaint_async()
   {
@@ -1457,6 +1499,10 @@ then
       fi
     fi
 
+    # recon-surf only depends on the completed aseg.auto.mgz and not on the
+    # segmentation stats or auxiliary HypVINN/CerebNet outputs below.
+    start_surface_recon_async
+
     {
       if [[ "$run_biasfield" == "true" ]]
       then
@@ -1572,26 +1618,31 @@ fi
 if [[ "$run_surf_pipeline" == "true" ]]
 then
 
-  echo "SURFACE RECONSTRUCTION PIPELINE" >> "$exec_time_log"
-  echo "===============================" >> "$exec_time_log"
-
-  if [[ "$threads_surf" == "max" ]]; then threads_surf="$(nproc)" ; fi
-  if [[ "$threads_surf" == "0" ]]; then threads_surf=1 ; fi
-  # ============= Running recon-surf (surfaces, thickness etc.) ===============
-  # use recon-surf to create surface models based on the FastSurferCNN segmentation.
-  pushd "$reconsurfdir" > /dev/null || exit 1
-  echo "cd $reconsurfdir" | tee -a "$seg_log"
-  cmd=("./recon-surf.sh" --sid "$subject" --sd "$sd" --t1 "$conformed_name" --mask_name "$mask_name"
-       --asegdkt_segfile "$asegdkt_segfile" --threads "$threads_surf" --py "$python" "${surf_flags[@]}")
-  echo_quoted "${cmd[@]}" | tee -a "$seg_log"
-  "${wrap[@]}" "${cmd[@]}" # no tee, this gets logged to recon-surf.log from inside recon-surf.sh
-  if [[ "${PIPESTATUS[0]}" != 0 ]]
+  if [[ "${surf_async_started:-false}" == "true" ]]
   then
-    echo "ERROR: Surface reconstruction failed! See recon-surf log: $subject_dir/scripts/recon-surf.log" | \
-      tee -a "$seg_log"
-    exit 1
+    wait_surface_recon_async
+  else
+    echo "SURFACE RECONSTRUCTION PIPELINE" >> "$exec_time_log"
+    echo "===============================" >> "$exec_time_log"
+
+    if [[ "$threads_surf" == "max" ]]; then threads_surf="$(nproc)" ; fi
+    if [[ "$threads_surf" == "0" ]]; then threads_surf=1 ; fi
+    # ============= Running recon-surf (surfaces, thickness etc.) ===============
+    # use recon-surf to create surface models based on the FastSurferCNN segmentation.
+    pushd "$reconsurfdir" > /dev/null || exit 1
+    echo "cd $reconsurfdir" | tee -a "$seg_log"
+    cmd=("./recon-surf.sh" --sid "$subject" --sd "$sd" --t1 "$conformed_name" --mask_name "$mask_name"
+         --asegdkt_segfile "$asegdkt_segfile" --threads "$threads_surf" --py "$python" "${surf_flags[@]}")
+    echo_quoted "${cmd[@]}" | tee -a "$seg_log"
+    "${wrap[@]}" "${cmd[@]}" # no tee, this gets logged to recon-surf.log from inside recon-surf.sh
+    if [[ "${PIPESTATUS[0]}" != 0 ]]
+    then
+      echo "ERROR: Surface reconstruction failed! See recon-surf log: $subject_dir/scripts/recon-surf.log" | \
+        tee -a "$seg_log"
+      exit 1
+    fi
+    popd > /dev/null || return
   fi
-  popd > /dev/null || return
 fi
 
 # ============= Running LIT Postprocessing ====================================

From b64278cc2d6d25a91e238e04c7514e633eb631e1 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sun, 17 May 2026 03:51:22 +0200
Subject: [PATCH 28/31] Handle embedded timing markers in recon-surf logs

The surface scheduling work can place FreeSurfer progress text before @#@FSTIME markers on the same physical log line. Anchor timing parsing on the marker token so recon-surf_times.yaml is still generated.

Validation: reran extract_recon_surf_time_info.py on surf_speed_surface_tail_overlap_threads8_run1/114823_MR1; output YAML was generated successfully. No processing output changes.
---
 recon_surf/utils/extract_recon_surf_time_info.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/recon_surf/utils/extract_recon_surf_time_info.py b/recon_surf/utils/extract_recon_surf_time_info.py
index aebee1709..8654561e6 100644
--- a/recon_surf/utils/extract_recon_surf_time_info.py
+++ b/recon_surf/utils/extract_recon_surf_time_info.py
@@ -142,17 +142,18 @@ def get_recon_all_stage_duration(line: str, previous_datetime_str: str) -> float
             ## Parse out cmd name, start time, and duration:
             entry_dict = {}
 
-            cmd_name = line_parts[2]
+            timestamp_index = line_parts.index(timestamp_feature)
+            cmd_name = line_parts[timestamp_index + 2]
             if cmd_name in filtered_cmds:
                 continue
-            date_time_str = line_parts[1]
+            date_time_str = line_parts[timestamp_index + 1]
             start_time = date_time_str[11:]
 
             start_date_time = datetime.strptime(
                 date_time_str, "%Y:%m:%d:%H:%M:%S"
             )
-            assert line_parts[5] == "e"
-            cmd_duration = float(line_parts[6])
+            assert line_parts[timestamp_index + 5] == "e"
+            cmd_duration = float(line_parts[timestamp_index + 6])
 
             end_date_time = start_date_time + timedelta(0, float(cmd_duration))
             end_date_time_str = end_date_time.strftime("%Y:%m:%d:%H:%M:%S")

From f3691cb62307127c3550acfc8a231783bbd577c6 Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sun, 17 May 2026 03:51:40 +0200
Subject: [PATCH 29/31] Overlap independent surface tail work

Overlap cortex+hipamyg label creation, inflate/curvature products, mapped anatomical stats, and pctsurfcon work with independent downstream surface steps. This keeps the same FreeSurfer commands and waits at the points where their outputs are needed.

Validation on 114823_MR1 (--threads 8, cuda):
- reference surf_speed_early_surface_after_cc_clean_threads8_run1: 34:26.62
- optimized surf_speed_surface_tail_overlap_threads8_run1: 33:52.75
- speedup: 33.87 seconds
- compare_surface_outputs: no surface geometry or morph value differences; volume outputs unchanged by the comparator. Differences are logs/scripts/path metadata/stats headers plus callosum.surf byte hash.
- stats data bodies are identical after ignoring comment/header lines.
- callosum.surf geometry is identical: coords_equal=True, faces_equal=True, coord_max_abs=0.0.
---
 recon_surf/recon-surf.sh | 64 +++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 1937f2690..51b338d80 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -944,19 +944,25 @@ for hemi in lh rh ; do
     echo "echo \"\""
   } | tee -a "$CMDF"
 
-  # create cortex label (1min)
-  # create nicer inflated surface from topo fixed (not needed, just later for visualization)
-  # identical for long processing. Run the underlying commands directly to avoid
-  # repeated recon-all wrapper/update-check overhead.
+  # Create the cortex labels and the visualization/surfreg inflated surface.
+  # Only the cortex label is needed before sampling the DKT annotation.  The
+  # cortex+hipamyg label is needed later for pial placement, and the inflated
+  # products are needed later for curvstats/sphere, so overlap those independent
+  # commands with sample_parc and white placement.
   echo "pushd $mdir > /dev/null || exit 1" >> "$CMDF"
   cmd="mri_label2label --label-cortex ../surf/$hemi.white.preaparc aseg.presurf.mgz 0 ../label/$hemi.cortex.label"
   RunIt "$cmd" "$LF" "$CMDF"
   echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
   echo "echo \"mri_label2label --label-cortex ../surf/${hemi}.white.preaparc aseg.presurf.mgz 0 ../label/${hemi}.cortex.label\" > $SUBJECTS_DIR/$subject/touch/${hemi}.cortex.touch" >> "$CMDF"
+  echo "popd > /dev/null || exit 1" >> "$CMDF"
+  echo "(" >> "$CMDF"
+  echo "pushd $mdir > /dev/null || exit 1" >> "$CMDF"
   cmd="mri_label2label --label-cortex ../surf/$hemi.white.preaparc aseg.presurf.mgz 1 ../label/$hemi.cortex+hipamyg.label"
   RunIt "$cmd" "$LF" "$CMDF"
   echo "echo \"mri_label2label --label-cortex ../surf/${hemi}.white.preaparc aseg.presurf.mgz 1 ../label/${hemi}.cortex+hipamyg.label\" > $SUBJECTS_DIR/$subject/touch/${hemi}.cortex+hipamyg.touch" >> "$CMDF"
   echo "popd > /dev/null || exit 1" >> "$CMDF"
+  echo ") & cortex_hipamyg_pid=\$!" >> "$CMDF"
+  echo "(" >> "$CMDF"
   cmd="mris_smooth -n 3 -nw -seed 1234 $sdir/$hemi.white.preaparc $sdir/$hemi.smoothwm"
   RunIt "$cmd" "$LF" "$CMDF"
   echo "echo \"mris_smooth -n 3 -nw -seed 1234 ../surf/${hemi}.white.preaparc ../surf/${hemi}.smoothwm\" > $SUBJECTS_DIR/$subject/touch/${hemi}.smoothwm2.touch" >> "$CMDF"
@@ -979,6 +985,7 @@ for hemi in lh rh ; do
   RunIt "$cmd" "$LF" "$CMDF"
   echo "echo \"mris_curvature -seed 1234 -thresh .999 -n -a 5 -w ${hemi}.inflated\" > $SUBJECTS_DIR/$subject/touch/${hemi}.inflate.H.K.touch" >> "$CMDF"
   echo "popd > /dev/null || exit 1" >> "$CMDF"
+  echo ") & inflate_curv_pid=\$!" >> "$CMDF"
 
 
 # ============================= MAP-DKT ==========================================================
@@ -1009,6 +1016,11 @@ for hemi in lh rh ; do
   # placement, so it is deferred below to overlap with ribbon construction.
   if [[ "$fsaparc" == "true" ]]
   then
+    echo "if [[ -n \"\${inflate_curv_pid:-}\" ]] ; then" >> "$CMDF"
+    echo "  wait \"\$inflate_curv_pid\"" >> "$CMDF"
+    echo "  if [[ \$? != 0 ]] ; then exit 1 ; fi" >> "$CMDF"
+    echo "  unset inflate_curv_pid" >> "$CMDF"
+    echo "fi" >> "$CMDF"
     {
       echo "echo \" \""
       echo "echo \"============ Creating surfaces $hemi - FS sphere, surfreg ===============\""
@@ -1158,6 +1170,8 @@ for hemi in lh rh ; do
 
   # CREAT PIAL SURFACE
   # 4 min compute pial :
+  echo "wait \"\$cortex_hipamyg_pid\"" >> "$CMDF"
+  echo "if [[ \$? != 0 ]] ; then exit 1 ; fi" >> "$CMDF"
   cmd="mris_place_surface --adgws-in ../surf/autodet.gw.stats.${hemi}.dat --seg aseg.presurf.mgz \
     --threads $threads_hemi --wm wm.mgz --invol brain.finalsurfs.mgz --$hemi --o ../surf/${hemi}.pial.T1 \
     --pial --nsmooth 0 --rip-label ../label/${hemi}.cortex+hipamyg.label \
@@ -1205,6 +1219,11 @@ for hemi in lh rh ; do
   RunIt "$cmd" "$LF" "$CMDF"
   cmd="mris_convert --volume $subject $hemi $sdir/$hemi.volume"
   RunIt "$cmd" "$LF" "$CMDF"
+  echo "if [[ -n \"\${inflate_curv_pid:-}\" ]] ; then" >> "$CMDF"
+  echo "  wait \"\$inflate_curv_pid\"" >> "$CMDF"
+  echo "  if [[ \$? != 0 ]] ; then exit 1 ; fi" >> "$CMDF"
+  echo "  unset inflate_curv_pid" >> "$CMDF"
+  echo "fi" >> "$CMDF"
   cmd="mris_curvature_stats -m --writeCurvatureFiles -G -o $statsdir/$hemi.curv.stats -F smoothwm $subject $hemi curv sulc"
   RunIt "$cmd" "$LF" "$CMDF"
   echo "mkdir -p $SUBJECTS_DIR/$subject/touch" >> "$CMDF"
@@ -1497,23 +1516,22 @@ then
 
 # ============================= MAPPED SURF-STATS =========================================
 
-  MAPPED_STATS_CMDF="$SUBJECTS_DIR/$subject/scripts/mapped_stats.cmdf"
-  rm -f "$MAPPED_STATS_CMDF"
-  {
-    echo "#!/bin/bash"
-    echo "echo \"\""
-    echo "echo \"===================== Creating surfaces - mapped stats =========================\""
-    echo "echo \"\""
-  } > "$MAPPED_STATS_CMDF"
-
   # 2x18sec create stats from mapped aparc.  This only depends on completed surfaces and
   # ribbon outputs, so it can overlap with the later volume-labeling/statistics chain.
   for hemi in lh rh
   do
+    MAPPED_STATS_CMDF="$SUBJECTS_DIR/$subject/scripts/mapped_stats.$hemi.cmdf"
+    rm -f "$MAPPED_STATS_CMDF"
+    {
+      echo "#!/bin/bash"
+      echo "echo \"\""
+      echo "echo \"===================== Creating surfaces $hemi - mapped stats =========================\""
+      echo "echo \"\""
+    } > "$MAPPED_STATS_CMDF"
     cmd="mris_anatomical_stats -th3 -mgz -cortex $ldir/$hemi.cortex.label -f $statsdir/$hemi.aparc.DKTatlas.mapped.stats -b -a $ldir/$hemi.aparc.DKTatlas.mapped.annot -c $ldir/aparc.annot.mapped.ctab $subject $hemi white"
     RunIt "$cmd" "$LF" "$MAPPED_STATS_CMDF"
+    start_async_cmdf "$MAPPED_STATS_CMDF"
   done
-  start_async_cmdf "$MAPPED_STATS_CMDF"
 
   if [[ "$fssurfreg" == "true" && "$ASYNC_BALABELS_STARTED" != "true" ]]
   then
@@ -1548,19 +1566,19 @@ then
       softlink_or_copy "rh.aparc.DKTatlas.mapped.annot" "rh.aparc.annot" "$LF"
     popd > /dev/null || (echo "Could not popd" ; exit 1)
 
-    PCTSURFCON_CMDF="$SUBJECTS_DIR/$subject/scripts/pctsurfcon.cmdf"
-    rm -f "$PCTSURFCON_CMDF"
-    {
-      echo "#!/bin/bash"
-      echo "echo \"\""
-      echo "echo \"===================== Creating surfaces - pctsurfcon =====================\""
-      echo "echo \"\""
-    } > "$PCTSURFCON_CMDF"
     for hemi in lh rh ; do
+      PCTSURFCON_CMDF="$SUBJECTS_DIR/$subject/scripts/pctsurfcon.$hemi.cmdf"
+      rm -f "$PCTSURFCON_CMDF"
+      {
+        echo "#!/bin/bash"
+        echo "echo \"\""
+        echo "echo \"===================== Creating surfaces $hemi - pctsurfcon =====================\""
+        echo "echo \"\""
+      } > "$PCTSURFCON_CMDF"
       cmd="pctsurfcon --s $subject --$hemi-only"
       RunIt "$cmd" "$LF" "$PCTSURFCON_CMDF"
+      start_async_cmdf "$PCTSURFCON_CMDF"
     done
-    start_async_cmdf "$PCTSURFCON_CMDF"
 
     # -hyporelabel creates aseg.presurf.hypos.mgz from aseg.presurf.mgz.
     # -apas2aseg creates aseg.mgz by editing aseg.presurf.hypos.mgz with surfaces.

From 678a3271b18bd055d5f3616a63d4847fefb9321e Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Sun, 17 May 2026 07:50:45 +0200
Subject: [PATCH 30/31] Overlap Talairach and defer tail wait

Run Talairach registration asynchronously and use orig_nu.mgz as the normalization source while Talairach only updates transform metadata. Also defer the final async command-file wait and aparc cleanup until after the cortex-label surf2volseg pass so independent tail work can overlap.

Validated on 114823_MR1 with Docker --user 4758:1001, --threads 8, --device cuda. Wall time improved from 33:52.75 to 33:04.89, a 47.86s speedup versus the previous optimized reference.

Output comparison versus surf_speed_surface_tail_overlap_threads8_run1 showed no changed MRI volumes or primary surface geometry. Stats bodies are identical; callosum.surf byte hash differs but coordinates/faces are identical. Remaining differences are logs, command files, stats headers, run paths, and Talairach path/timestamp text.
---
 recon_surf/recon-surf.sh | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/recon_surf/recon-surf.sh b/recon_surf/recon-surf.sh
index 51b338d80..4666e2b70 100755
--- a/recon_surf/recon-surf.sh
+++ b/recon_surf/recon-surf.sh
@@ -675,6 +675,9 @@ fi
 
 # ============================= TALAIRACH ==============================================
 
+TALAIRACH_PID=""
+TALAIRACH_ASYNC="false"
+norm_source="$mdir/nu.mgz"
 if [[ ! -f "$mdir/transforms/talairach.lta" ]] || [[ ! -f "$mdir/transforms/talairach_with_skull.lta" ]] ; then
   # if talairach registration is missing, compute it here
   # this also creates talairach.auto.xfm and talairach.xfm and talairach.xfm.lta
@@ -691,7 +694,13 @@ if [[ ! -f "$mdir/transforms/talairach.lta" ]] || [[ ! -f "$mdir/transforms/tala
     echo " "
   } | tee -a "$LF"
   echo_quoted "${cmda[@]}"
-  "${cmda[@]}"
+  "${cmda[@]}" &
+  TALAIRACH_PID=$!
+  TALAIRACH_ASYNC="true"
+  # mri_add_xform_to_header only changes transform metadata for nu.mgz. The
+  # following normalization/masking steps are voxel-identical when run from
+  # orig_nu.mgz, so overlap them with Talairach registration.
+  norm_source="$mdir/orig_nu.mgz"
 fi
 
 
@@ -704,7 +713,7 @@ fi
 
 # the difference between nu and orig_nu is the fact that nu has the talairach-registration header
 # create norm by masking nu (supports manedit-ed mask)
-cmda=(mri_mask "$mdir/nu.mgz" "$mask" "$mdir/norm.mgz")
+cmda=(mri_mask "$norm_source" "$mask" "$mdir/norm.mgz")
 run_it "$LF" "${cmda[@]}"
 if [[ "$get_t1" == "true" ]]
 then
@@ -757,6 +766,17 @@ else # cross and base
   RunIt "$cmd" "$LF"
 fi
 
+if [[ "$TALAIRACH_ASYNC" == "true" ]]
+then
+  echo "Waiting for async Talairach registration to complete." | tee -a "$LF"
+  wait "$TALAIRACH_PID"
+  if [[ $? != 0 ]]
+  then
+    echo "ERROR: Async Talairach registration failed!" | tee -a "$LF"
+    exit 1
+  fi
+fi
+
 
 # =======
 # ================================================== SURFACES ==============================================================
@@ -1598,12 +1618,6 @@ then
       echo "mri_surf2volseg --o aseg.mgz --i aseg.presurf.hypos.mgz --fix-presurf-with-ribbon ../mri/ribbon.mgz --threads $threads --lh-cortex-mask ../label/lh.cortex.label --lh-white ../surf/lh.white --lh-pial ../surf/lh.pial --rh-cortex-mask ../label/rh.cortex.label --rh-white ../surf/rh.white --rh-pial ../surf/rh.pial" > "$SUBJECTS_DIR/$subject/touch/apas2aseg.touch"
     popd > /dev/null || (echo "Could not popd" ; exit 1)
 
-    wait_async_cmdfs
-
-    pushd "$ldir" > /dev/null || (echo "Could not cd to $ldir" ; exit 1)
-      cmd="rm *h.aparc.annot"
-      RunIt "$cmd" "$LF"
-    popd > /dev/null || (echo "Could not popd" ; exit 1)
   fi
 
   ASYNC_ASEG_STATS="false"
@@ -1691,6 +1705,13 @@ then
   cmd="mri_surf2volseg --o $mdir/aparc.DKTatlas+aseg.mapped.mgz --label-cortex --i $mdir/aseg.mgz --threads $surf2volseg_threads --hashres 4 --lh-annot $ldir/lh.aparc.DKTatlas.mapped.annot 1000 --lh-cortex-mask $ldir/lh.cortex.label --lh-white $sdir/lh.white --lh-pial $sdir/lh.pial --rh-annot $ldir/rh.aparc.DKTatlas.mapped.annot 2000 --rh-cortex-mask $ldir/rh.cortex.label --rh-white $sdir/rh.white --rh-pial $sdir/rh.pial"
   RunIt "$cmd" "$LF"
   wait_async_cmdfs
+  if [[ "$fsaparc" == "false" ]]
+  then
+    pushd "$ldir" > /dev/null || (echo "Could not cd to $ldir" ; exit 1)
+      cmd="rm *h.aparc.annot"
+      RunIt "$cmd" "$LF"
+    popd > /dev/null || (echo "Could not popd" ; exit 1)
+  fi
   cmda=($python "${binpath}merge_wmparc_aparc.py"
         --aseg "$mdir/aseg.mgz"
         --aparc "$mdir/aparc.DKTatlas+aseg.mapped.mgz"

From 572fb9eee007e5fb65b38ced1982a87f950757ea Mon Sep 17 00:00:00 2001
From: ClePol <ClePol@users.noreply.github.com>
Date: Fri, 22 May 2026 14:07:32 +0200
Subject: [PATCH 31/31] fix ruff error

---
 FastSurferCNN/run_prediction.py           |  2 +-
 HypVINN/inference.py                      |  7 ++++++-
 recon_surf/cropped_mris_volmask.py        | 11 +++++++----
 recon_surf/cropped_volume.py              |  2 +-
 recon_surf/merge_wmparc_aparc.py          |  1 -
 recon_surf/spherically_project_wrapper.py |  3 ++-
 6 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/FastSurferCNN/run_prediction.py b/FastSurferCNN/run_prediction.py
index 6e1726b02..38d34c2e4 100644
--- a/FastSurferCNN/run_prediction.py
+++ b/FastSurferCNN/run_prediction.py
@@ -48,8 +48,8 @@
 from FastSurferCNN.utils.common import SubjectDirectory, SubjectList, find_device, handle_cuda_memory_exception
 from FastSurferCNN.utils.load_config import load_config
 from FastSurferCNN.utils.parallel import SerialExecutor, pipeline
-from FastSurferCNN.utils.torchscript import cpu_torch_threads
 from FastSurferCNN.utils.parser_defaults import SubjectDirectoryConfig
+from FastSurferCNN.utils.torchscript import cpu_torch_threads
 
 LOGGER = logging.getLogger(__name__)
 
diff --git a/HypVINN/inference.py b/HypVINN/inference.py
index bc1dc6cc9..f7b4a1b8f 100644
--- a/HypVINN/inference.py
+++ b/HypVINN/inference.py
@@ -25,7 +25,12 @@
 import FastSurferCNN.utils.logging as logging
 from FastSurferCNN.data_loader.augmentation import ToTensorTest
 from FastSurferCNN.utils.common import find_device
-from FastSurferCNN.utils.torchscript import cpu_torch_threads, env_flag_enabled, should_trace_cpu_inference, trace_for_inference
+from FastSurferCNN.utils.torchscript import (
+    cpu_torch_threads,
+    env_flag_enabled,
+    should_trace_cpu_inference,
+    trace_for_inference,
+)
 from HypVINN.data_loader.data_utils import hypo_map_prediction_sagittal2full
 from HypVINN.data_loader.dataset import HypVINNDataset
 from HypVINN.models.networks import build_model
diff --git a/recon_surf/cropped_mris_volmask.py b/recon_surf/cropped_mris_volmask.py
index f72d58369..e497a271a 100755
--- a/recon_surf/cropped_mris_volmask.py
+++ b/recon_surf/cropped_mris_volmask.py
@@ -11,12 +11,13 @@
 
 import nibabel as nib
 import numpy as np
-from nibabel.freesurfer.io import read_geometry
-
 from cropped_volume import crop_slices, freesurfer_env, load_volume, save_volume
+from nibabel.freesurfer.io import read_geometry
 
 
-def _surface_voxel_bounds(subject_dir: Path, hemi: str, img: nib.spatialimages.SpatialImage) -> tuple[np.ndarray, np.ndarray]:
+def _surface_voxel_bounds(
+    subject_dir: Path, hemi: str, img: nib.spatialimages.SpatialImage
+) -> tuple[np.ndarray, np.ndarray]:
     inv = np.linalg.inv(img.affine)
     coords = []
     for surface in ("white", "pial"):
@@ -27,7 +28,9 @@ def _surface_voxel_bounds(subject_dir: Path, hemi: str, img: nib.spatialimages.S
     return np.floor(points.min(axis=0)).astype(int), (np.ceil(points.max(axis=0)) + 1).astype(int)
 
 
-def _bounds(shape: tuple[int, ...], surface_start: np.ndarray, surface_stop: np.ndarray, margin: int) -> tuple[np.ndarray, np.ndarray]:
+def _bounds(
+    shape: tuple[int, ...], surface_start: np.ndarray, surface_stop: np.ndarray, margin: int
+) -> tuple[np.ndarray, np.ndarray]:
     start = surface_start
     stop = surface_stop
     start = np.maximum(0, start - margin)
diff --git a/recon_surf/cropped_volume.py b/recon_surf/cropped_volume.py
index bcb8ee441..eec774114 100644
--- a/recon_surf/cropped_volume.py
+++ b/recon_surf/cropped_volume.py
@@ -48,7 +48,7 @@ def bounds_from_mask(mask: np.ndarray, margin: int) -> tuple[np.ndarray, np.ndar
 
 
 def crop_slices(start: np.ndarray, stop: np.ndarray) -> tuple[slice, ...]:
-    return tuple(slice(int(s), int(e)) for s, e in zip(start, stop))
+    return tuple(slice(int(s), int(e)) for s, e in zip(start, stop, strict=True))
 
 
 def freesurfer_env() -> dict[str, str]:
diff --git a/recon_surf/merge_wmparc_aparc.py b/recon_surf/merge_wmparc_aparc.py
index b7ca38555..5fc85233d 100644
--- a/recon_surf/merge_wmparc_aparc.py
+++ b/recon_surf/merge_wmparc_aparc.py
@@ -5,7 +5,6 @@
 import nibabel as nib
 import numpy as np
 
-
 WM_AND_HYPO_LABELS = (2, 41, 77, 78, 79, 87, 88, 89)
 
 
diff --git a/recon_surf/spherically_project_wrapper.py b/recon_surf/spherically_project_wrapper.py
index 8d3ccd059..2e09353a2 100644
--- a/recon_surf/spherically_project_wrapper.py
+++ b/recon_surf/spherically_project_wrapper.py
@@ -95,9 +95,10 @@ def setup_options():
     try:
         from os import environ
 
-        from recon_surf.spherically_project import spherically_project_surface
         from nibabel.freesurfer.io import read_geometry
 
+        from recon_surf.spherically_project import spherically_project_surface
+
         source_surface = opts.sd / opts.subject / "surf" / f"{opts.hemi}.smoothwm.nofix"
         projected_surface = opts.sd / opts.subject / "surf" / f"{opts.hemi}.qsphere.nofix"
         print(f"Reading in surface: {source_surface} ...")