From 86336fed019776098f9eaab0282fdbb958471838 Mon Sep 17 00:00:00 2001 From: Jackson Holiday Wheeler Date: Thu, 19 Feb 2026 23:59:25 +0800 Subject: [PATCH 1/2] feat: add voice-to-text push-to-talk transcription Push-to-talk system using ffmpeg + faster-whisper for local speech-to-text. Press ALT+R to start recording, press again to stop, transcribe, and type the result into the focused window via wtype. Co-Authored-By: Claude Opus 4.6 --- transcribe.py | 20 +++++++++++++++ voice-to-text | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100755 transcribe.py create mode 100755 voice-to-text diff --git a/transcribe.py b/transcribe.py new file mode 100755 index 0000000..556a37e --- /dev/null +++ b/transcribe.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +"""Transcribe a WAV file using faster-whisper (base model, CPU, int8).""" + +import sys +from faster_whisper import WhisperModel + + +def main(): + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + + model = WhisperModel("base", device="cpu", compute_type="int8") + segments, _ = model.transcribe(sys.argv[1], beam_size=5) + text = " ".join(seg.text.strip() for seg in segments) + print(text) + + +if __name__ == "__main__": + main() diff --git a/voice-to-text b/voice-to-text new file mode 100755 index 0000000..1aaae76 --- /dev/null +++ b/voice-to-text @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# +# voice-to-text — push-to-talk toggle for Wayland +# 1st press: start recording 2nd press: stop, transcribe, type into terminal +# + +set -euo pipefail + +PID_FILE="/tmp/voice-to-text.pid" +WAV_FILE="/tmp/voice-to-text.wav" +LOG_FILE="/tmp/voice-to-text.log" +TRANSCRIBE="$HOME/scripts/transcribe.py" +VENV_PYTHON="$HOME/venvs/voice-to-text/bin/python" +HINT="string:x-canonical-private-synchronous:voice-to-text" + +notify() { + local timeout="${1:--1}"; shift + notify-send -a "Voice Recorder" -t "$timeout" -h "$HINT" "$@" +} + +# ── Stop recording & transcribe ────────────────────────────────────── +if [[ -f "$PID_FILE" ]]; then + pid=$(cat "$PID_FILE") + rm -f "$PID_FILE" + + if kill -0 "$pid" 2>/dev/null; then + kill -INT "$pid" + # Poll until ffmpeg exits (can't use wait — different shell) + for _ in $(seq 1 50); do + kill -0 "$pid" 2>/dev/null || break + sleep 0.1 + done + fi + + if [[ ! -s "$WAV_FILE" ]]; then + notify 5000 "Error" "Recording is empty — nothing to transcribe." + exit 1 + fi + + notify -1 "Transcribing..." + + text=$("$VENV_PYTHON" "$TRANSCRIBE" "$WAV_FILE" 2>"$LOG_FILE") || true + + if [[ -z "$text" ]]; then + err=$(cat "$LOG_FILE" 2>/dev/null || echo "unknown error") + notify 5000 "Error" "Transcription failed: $err" + exit 1 + fi + + printf '%s' "$text" | wl-copy + wtype -- "$text" + + notify 5000 "Transcribed" "$text" + exit 0 +fi + +# ── Start recording ────────────────────────────────────────────────── + +if ! pactl info &>/dev/null; then + notify 5000 "Error" "PulseAudio/PipeWire not available." + exit 1 +fi + +rm -f "$WAV_FILE" + +ffmpeg -y -f pulse -i default -ac 1 -ar 16000 "$WAV_FILE" &>/dev/null & +echo $! > "$PID_FILE" + +notify 0 "Recording..." "Press ALT+R again to stop." From b21ce63a65f9b2edfe87a52119f9d0b5af88cccd Mon Sep 17 00:00:00 2001 From: Jackson Holiday Wheeler Date: Mon, 9 Mar 2026 14:57:30 +0800 Subject: [PATCH 2/2] chore: cleanup --- transcribe.py | 20 --------------- voice-to-text | 69 --------------------------------------------------- 2 files changed, 89 deletions(-) delete mode 100755 transcribe.py delete mode 100755 voice-to-text diff --git a/transcribe.py b/transcribe.py deleted file mode 100755 index 556a37e..0000000 --- a/transcribe.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python3 -"""Transcribe a WAV file using faster-whisper (base model, CPU, int8).""" - -import sys -from faster_whisper import WhisperModel - - -def main(): - if len(sys.argv) != 2: - print(f"Usage: {sys.argv[0]} ", file=sys.stderr) - sys.exit(1) - - model = WhisperModel("base", device="cpu", compute_type="int8") - segments, _ = model.transcribe(sys.argv[1], beam_size=5) - text = " ".join(seg.text.strip() for seg in segments) - print(text) - - -if __name__ == "__main__": - main() diff --git a/voice-to-text b/voice-to-text deleted file mode 100755 index 1aaae76..0000000 --- a/voice-to-text +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash -# -# voice-to-text — push-to-talk toggle for Wayland -# 1st press: start recording 2nd press: stop, transcribe, type into terminal -# - -set -euo pipefail - -PID_FILE="/tmp/voice-to-text.pid" -WAV_FILE="/tmp/voice-to-text.wav" -LOG_FILE="/tmp/voice-to-text.log" -TRANSCRIBE="$HOME/scripts/transcribe.py" -VENV_PYTHON="$HOME/venvs/voice-to-text/bin/python" -HINT="string:x-canonical-private-synchronous:voice-to-text" - -notify() { - local timeout="${1:--1}"; shift - notify-send -a "Voice Recorder" -t "$timeout" -h "$HINT" "$@" -} - -# ── Stop recording & transcribe ────────────────────────────────────── -if [[ -f "$PID_FILE" ]]; then - pid=$(cat "$PID_FILE") - rm -f "$PID_FILE" - - if kill -0 "$pid" 2>/dev/null; then - kill -INT "$pid" - # Poll until ffmpeg exits (can't use wait — different shell) - for _ in $(seq 1 50); do - kill -0 "$pid" 2>/dev/null || break - sleep 0.1 - done - fi - - if [[ ! -s "$WAV_FILE" ]]; then - notify 5000 "Error" "Recording is empty — nothing to transcribe." - exit 1 - fi - - notify -1 "Transcribing..." - - text=$("$VENV_PYTHON" "$TRANSCRIBE" "$WAV_FILE" 2>"$LOG_FILE") || true - - if [[ -z "$text" ]]; then - err=$(cat "$LOG_FILE" 2>/dev/null || echo "unknown error") - notify 5000 "Error" "Transcription failed: $err" - exit 1 - fi - - printf '%s' "$text" | wl-copy - wtype -- "$text" - - notify 5000 "Transcribed" "$text" - exit 0 -fi - -# ── Start recording ────────────────────────────────────────────────── - -if ! pactl info &>/dev/null; then - notify 5000 "Error" "PulseAudio/PipeWire not available." - exit 1 -fi - -rm -f "$WAV_FILE" - -ffmpeg -y -f pulse -i default -ac 1 -ar 16000 "$WAV_FILE" &>/dev/null & -echo $! > "$PID_FILE" - -notify 0 "Recording..." "Press ALT+R again to stop."