Files
streamdeck-go/systemd/streamdeck-go-watchdog.sh
2026-05-10 13:35:16 -06:00

178 lines
6.5 KiB
Bash

#!/usr/bin/env bash
# streamdeck-go watchdog — runs every 30s via systemd timer (Linux) or launchd
# StartInterval (macOS).
#
# Why this exists: when the Stream Deck is unplugged and replugged, the daemon's
# in-process reconnect logic does not always notice. On Linux, hidraw can keep
# returning read timeouts on the now-stale fd instead of surfacing an error, so
# the "3 consecutive errors → reconnect" path never triggers, and the service
# manager still reports the service as active even though the device is
# unreachable.
#
# Strategy: track the device's transient USB address (Linux: bus:device,
# macOS: Location ID). When it changes (unplug/replug) or the service is
# inactive while a device is present, restart the service.
# Strict mode: abort on unhandled errors, unset variables, and failures in any
# stage of a pipeline.
set -euo pipefail

# Stream Deck USB product IDs we support (see internal/device/streamdeck.go),
# written as a regex alternation so it can be spliced into a larger pattern.
PIDS_RE="00ba|006c|006d"

# Kernel name; selects the platform-specific tooling used throughout.
OS="$(uname -s)"

# Pick the directory that holds the watchdog's state file.
if [[ "$OS" == "Linux" ]]; then
  STATE_DIR="${XDG_RUNTIME_DIR:-/tmp}"
elif [[ "$OS" == "Darwin" ]]; then
  # macOS has no XDG_RUNTIME_DIR; TMPDIR is the user-private temp dir.
  STATE_DIR="${TMPDIR:-/tmp}"
else
  echo "watchdog: unsupported OS: $OS" >&2
  exit 1
fi

# Holds the device address recorded by the most recent run that saw a device.
STATE_FILE="$STATE_DIR/streamdeck-go-watchdog.state"
# Print a transient identifier for the first matching Stream Deck on the USB
# bus, or nothing if none is present. The identifier must change across
# unplug/replug so we can detect it.
#
# Globals:  OS (read)      - "Linux" or "Darwin"; any other value prints nothing
#           PIDS_RE (read) - regex alternation of supported product IDs
# Outputs:  Linux:  "BUS:DEVICE", e.g. "003:052"
#           macOS:  the Location ID value, e.g. "0x14140000 / 5"
# Returns:  the pipeline's exit status (the script runs with pipefail)
#
# The awk programs deliberately do NOT `exit` on the first match. Quitting
# early closes the pipe while the producer may still be writing; the producer
# then dies from SIGPIPE, pipefail turns that into status 141, and the
# `curr="$(current_addr)"` assignment would abort the whole script under
# `set -e` even though the address was found. A "found" flag keeps the
# first-match semantics while letting awk drain its input.
current_addr() {
  case "$OS" in
    Linux)
      # "Bus 003 Device 052: ID 0fd9:00ba ..." → "003:052"
      lsusb 2>/dev/null | awk -v pids="$PIDS_RE" '
        !found && $0 ~ ("ID 0fd9:(" pids ")") {
          gsub(":", "", $4)   # field 4 is "052:" - drop the trailing colon
          print $2 ":" $4
          found = 1
        }
      '
      ;;
    Darwin)
      # system_profiler entry per device:
      #   Stream Deck XL:
      #     Product ID: 0x00ba
      #     Vendor ID: 0x0fd9 (Elgato ...)
      #     ...
      #     Location ID: 0x14140000 / 5
      # The trailing "/ N" is the bus address — it changes on replug.
      # Product/Vendor ID are remembered as they go by and checked when the
      # same entry's Location ID line is reached.
      system_profiler SPUSBDataType 2>/dev/null | awk -v pids="$PIDS_RE" '
        /^[[:space:]]*Product ID:/ { pid = $3 }
        /^[[:space:]]*Vendor ID:/ { vid = $3 }
        !found && /^[[:space:]]*Location ID:/ {
          sub(/^[[:space:]]*Location ID:[[:space:]]*/, "")
          if (vid == "0x0fd9" && pid ~ ("^0x(" pids ")$")) {
            print
            found = 1
          }
        }
      '
      ;;
  esac
}
# Report whether the streamdeck-go service is currently running.
#
# Globals:  OS (read)
# Returns:  0 if active, non-zero otherwise. On Linux this is the exit status
#           of `systemctl --user is-active`. For an OS value other than
#           Linux/Darwin nothing is checked and the status is 0.
service_active() {
  if [[ "$OS" == "Linux" ]]; then
    systemctl --user is-active --quiet streamdeck-go.service
    return
  fi

  if [[ "$OS" == "Darwin" ]]; then
    # `launchctl list` prints "PID Status Label" rows. Pull the PID column of
    # our agent's row; "-" there means loaded but not running, and no row at
    # all leaves the variable empty.
    local agent_pid
    agent_pid="$(launchctl list 2>/dev/null | awk '$3 == "com.woodarddigital.streamdeck-go" { print $1 }')"
    if [[ -z "$agent_pid" || "$agent_pid" == "-" ]]; then
      return 1
    fi
    return 0
  fi
}
# Restart the streamdeck-go service through the platform's service manager.
#
# Globals:  OS (read)
# Returns:  exit status of the restart command; 0 (and no action) when OS is
#           neither Linux nor Darwin.
restart_service() {
  if [[ "$OS" == "Linux" ]]; then
    systemctl --user restart streamdeck-go.service
  elif [[ "$OS" == "Darwin" ]]; then
    # kickstart -k stops and restarts; works whether or not it's running.
    launchctl kickstart -k "gui/$(id -u)/com.woodarddigital.streamdeck-go"
  fi
}
# Address recorded by an earlier run; stays empty when the state file is
# missing or cannot be read.
prev=""
if [[ -f "$STATE_FILE" ]]; then
  prev="$(cat "$STATE_FILE" 2>/dev/null || true)"
fi

# Address of the device as seen right now (empty when none is plugged in).
curr="$(current_addr)"

# No device present — nothing to do. Don't touch the service, and leave the
# state file alone: if we overwrote it with an empty string while the device
# was absent (e.g. mid-KVM-swap), the very next run would see prev="" and miss
# the address change on return.
if [[ -z "$curr" ]]; then
  exit 0
fi

# Device present: record its address as the baseline for the next run.
printf '%s' "$curr" > "$STATE_FILE"
# Linux-only: detect a stale hidraw fd held by the daemon. When the device
# unplugs, hidraw's open fd survives but its /dev node is removed; procfs
# marks the symlink "(deleted)". hid_read_timeout on this fd silently returns
# zero bytes, so the daemon's 3-error reconnect path never trips.
#
# Globals:  OS (read)
# Returns:  0 if any fd of the service's main process links to a deleted
#           hidraw node; 1 otherwise (including non-Linux, no main PID, or no
#           /proc entry for it).
#
# The fd symlinks are read one by one with readlink and matched in-shell,
# rather than piping `ls -la` into `grep -q`: under pipefail, grep quitting on
# the first match can kill the producer with SIGPIPE and turn a positive
# detection into a non-zero pipeline status.
stale_fd_detected() {
  [[ "$OS" != "Linux" ]] && return 1
  local pid
  pid="$(systemctl --user show -p MainPID --value streamdeck-go.service 2>/dev/null || true)"
  # MainPID is 0 (or empty) when the service has no running main process.
  [[ -z "$pid" || "$pid" == "0" ]] && return 1
  [[ ! -d "/proc/$pid/fd" ]] && return 1

  local -r stale_re='hidraw[0-9]+ \(deleted\)'
  local link target
  for link in "/proc/$pid/fd"/*; do
    # An fd can close between the glob and the readlink (and an unmatched
    # glob stays literal); either way readlink fails and we move on.
    target="$(readlink "$link" 2>/dev/null)" || continue
    if [[ "$target" =~ $stale_re ]]; then
      return 0
    fi
  done
  return 1
}
# Linux-only: detect that the system resumed from suspend after the daemon
# started. On resume, the xhci controller may reset the deck's USB device
# in place (same bus address, same hidraw node, fd not deleted). The kernel
# reset leaves the existing fd's input queue dead — buttons no longer reach
# userspace — but no externally visible signal flags the failure. Restarting
# the daemon is cheap and reliably fixes it.
#
# Idempotent by construction: once we restart, the daemon's ActiveEnterTimestamp
# moves past the resume event, so this check stops firing until the next sleep.
#
# Globals:  OS (read)
# Returns:  0 if the kernel journal since the service's ActiveEnterTimestamp
#           contains a resume marker; non-zero otherwise (including non-Linux,
#           no timestamp, an unparsable timestamp, or a journalctl failure).
resumed_since_start() {
  [[ "$OS" != "Linux" ]] && return 1
  local started
  started="$(systemctl --user show -p ActiveEnterTimestamp --value streamdeck-go.service 2>/dev/null || true)"
  [[ -z "$started" || "$started" == "n/a" ]] && return 1
  local started_epoch
  # journalctl --since accepts "@<epoch seconds>", so convert the timestamp.
  started_epoch="$(date -d "$started" +%s 2>/dev/null || true)"
  [[ -z "$started_epoch" ]] && return 1
  # grep must read the journal to the end (no -q, no -m): if it quit on the
  # first match, journalctl could die from SIGPIPE and pipefail would report
  # the pipeline as failed — i.e. a real resume would be read as "no resume".
  journalctl -k --since "@$started_epoch" --no-pager 2>/dev/null \
    | grep -E 'PM: suspend exit|PM: Finishing wakeup' >/dev/null
}
# Decide whether the service needs a restart. Checks run from cheapest and
# most specific to most expensive; the first one that applies sets $reason,
# and an empty $reason means the service is left alone.
reason=""
if [[ -z "$prev" ]]; then
  # First observation (or state file was wiped). Only restart if the service
  # is also down — if it's already running, assume it's healthy and just
  # record the baseline.
  if ! service_active; then
    reason="device present at $curr but service is not active"
  fi
elif [[ "$curr" != "$prev" ]]; then
  # Old and new address are separated explicitly so the log line is readable
  # (e.g. "003:051 → 003:052" rather than the two values run together).
  reason="device address changed: $prev → $curr (likely unplug/replug)"
elif ! service_active; then
  reason="device present at $curr but service is not active"
elif stale_fd_detected; then
  reason="daemon holds a deleted hidraw fd (post-unplug stale handle)"
elif resumed_since_start; then
  reason="system resumed from suspend since daemon started (USB reset may have invalidated input queue)"
fi

# Log why, then restart. With `set -e` active a failed restart makes the
# watchdog itself exit non-zero.
if [[ -n "$reason" ]]; then
  echo "watchdog: $reason — restarting streamdeck-go"
  restart_service
fi