#!/bin/bash
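# QMI link watchdog: if the QMI data link is down, restart quectel-qmi; if that
# does not help, reset the modem (dms-reset) and, once the persisted failure
# count reaches its threshold, reboot the system. Run from cron every 15 min.
# Logs to /app/qmi-log/qmi-failover.log.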

# Log destination. The directory is created on a best-effort basis: a failure
# to create it is ignored and its error output is discarded.
QMI_LOG_DIR="/app/qmi-log"
LOG="${QMI_LOG_DIR}/qmi-failover.log"
if ! mkdir -p "$QMI_LOG_DIR" 2>/dev/null; then
	: # best effort only
fi

# Append one line to $LOG: the output of `date -Iseconds`, a space, then all
# arguments joined via "$*".
log()
{
	local stamp
	stamp="$(date -Iseconds)"
	printf '%s %s\n' "$stamp" "$*" >> "$LOG"
}

# QMI control device node and the host pinged to verify connectivity.
DEV="/dev/cdc-wdm0"
PING_TARGET="8.8.8.8"

# Escalation inside one run: number of check/stop-start rounds before a
# dms-reset, then the polling interval and polling budget used after the reset.
MAX_TRIES_BEFORE_RESET=3
RESET_POLL_INTERVAL_SEC=10
RESET_POLL_MAX_SEC=$((15 * 60))

# Failure counter kept in STATE_FILE between runs. When it reaches
# REBOOT_THRESHOLD the system is rebooted after REBOOT_DELAY_SEC seconds.
STATE_DIR="/var/lib/matelex"
STATE_FILE="${STATE_DIR}/qmi-failover.state"
REBOOT_DELAY_SEC=10
REBOOT_THRESHOLD=4

# Mark the start of this run in the log.
log "=== qmi-failover run ==="

# Print the name of the first network interface under /sys/class/net that has
# a qmi/raw_ip file. Entries matching wwan* are examined first, then every
# entry. Returns 0 after printing a name, 1 when no interface qualifies.
find_qmi_iface() {
	local path name
	for path in /sys/class/net/wwan* /sys/class/net/*; do
		# Also skips the literal pattern left behind by a glob with no match.
		if [ ! -d "$path" ]; then
			continue
		fi
		name="${path##*/}"
		if [ -f "/sys/class/net/${name}/qmi/raw_ip" ] && echo "$name"; then
			return 0
		fi
	done
	return 1
}

# Resolved once, here; left empty when no QMI interface was found.
IFACE="$(find_qmi_iface 2>/dev/null || true)"

# Run `qmi-network $DEV status` (stderr discarded) and print the second
# whitespace-separated field of every output line containing "Status:".
# Prints nothing when no such line is produced.
get_qmi_network_status() {
	local report
	report="$(/usr/bin/qmi-network "$DEV" status 2>/dev/null)"
	printf '%s\n' "$report" | /usr/bin/awk '/Status:/ {print $2}'
}

# Ping $PING_TARGET through interface $IFACE using ping's `-w 3` option.
# All ping output is appended to $LOG; the function's status is ping's status.
ping_ok() {
	/usr/bin/ping -w 3 -I "$IFACE" "$PING_TARGET" >> "$LOG" 2>&1
}

# Succeed (status 0) when the output of `qmicli --nas-get-serving-system` for
# $DEV contains the text: Packet switched: 'all-calls'
# qmicli's stderr is discarded.
is_ps_barring() {
	local serving
	serving="$(/usr/bin/qmicli -d "$DEV" --nas-get-serving-system 2>/dev/null)"
	printf '%s\n' "$serving" | /usr/bin/grep -q "Packet switched: 'all-calls'"
}

# Load the persisted failure counter into the global FAIL_COUNT.
# Globals:  STATE_DIR, STATE_FILE (read); FAIL_COUNT (written).
# Creates STATE_DIR if missing (best effort). The counter is read from the
# "fail_count=<n>" line of STATE_FILE; a missing file, a missing line or any
# value that is not purely decimal digits yields 0.
load_fail_count()
{
	FAIL_COUNT=0
	[ -d "$STATE_DIR" ] || /bin/mkdir -p "$STATE_DIR" 2>/dev/null || true
	if [ -f "$STATE_FILE" ]; then
		FAIL_COUNT="$(/usr/bin/awk -F= '/^fail_count=/ {print $2}' "$STATE_FILE" 2>/dev/null)"
	fi
	case "$FAIL_COUNT" in
		''|*[!0-9]*) FAIL_COUNT=0 ;;
		# Force base 10: a digits-only value with a leading zero (e.g. "08")
		# would otherwise be read as octal by later $(( )) arithmetic and
		# make the increment fail.
		*) FAIL_COUNT=$((10#$FAIL_COUNT)) ;;
	esac
}

# Persist the global FAIL_COUNT to STATE_FILE as a single "fail_count=<n>" line.
# Globals:  STATE_DIR, STATE_FILE, FAIL_COUNT (read).
# Returns:  0 when the file was replaced, 1 otherwise (the failure is logged).
# The value is written to a temporary file in the same directory and then
# renamed over STATE_FILE, so an interrupted write cannot leave a truncated
# state file behind.
save_fail_count()
{
	local tmp="${STATE_FILE}.tmp.$$"

	/bin/mkdir -p "$STATE_DIR" 2>/dev/null || true
	if printf 'fail_count=%s\n' "$FAIL_COUNT" > "$tmp" \
		&& /bin/mv -f "$tmp" "$STATE_FILE"; then
		return 0
	fi

	# Do not leave a stray temporary file behind, and make the failure visible.
	/bin/rm -f "$tmp" 2>/dev/null
	log "failed to persist fail_count=$FAIL_COUNT to $STATE_FILE"
	return 1
}

# Zero the global FAIL_COUNT and persist it through save_fail_count.
# The function's status is that of save_fail_count.
reset_fail_count() {
	FAIL_COUNT=0 && save_fail_count
}

# Count one more failed run and reboot once the threshold is reached.
# Arguments: $1 - short reason tag, used only in the log line.
# Globals:   FAIL_COUNT (read/written), REBOOT_THRESHOLD, REBOOT_DELAY_SEC,
#            LOG (read).
# Returns:   0.
# The incremented counter is persisted immediately. When it reaches
# REBOOT_THRESHOLD the counter is persisted as 0 before the reboot is
# requested, so the next boot starts a fresh count instead of rebooting again
# on its first failing run. If the reboot request itself fails, the previous
# count is restored so the next run retries the reboot.
record_failure_and_maybe_reboot()
{
	local reason="$1"
	local reached

	FAIL_COUNT=$((FAIL_COUNT + 1))
	save_fail_count
	log "long-term failure count incremented to $FAIL_COUNT (reason: $reason)"

	if [ "$FAIL_COUNT" -ge "$REBOOT_THRESHOLD" ]; then
		log "failure threshold reached ($FAIL_COUNT >= $REBOOT_THRESHOLD); rebooting in ${REBOOT_DELAY_SEC}s"
		reached="$FAIL_COUNT"
		FAIL_COUNT=0
		save_fail_count
		sleep "$REBOOT_DELAY_SEC"
		if ! /bin/systemctl reboot >> "$LOG" 2>&1; then
			log "reboot request failed; restoring failure count to $reached"
			FAIL_COUNT="$reached"
			save_fail_count
		fi
	fi
	return 0
}

# Restart the QMI connection: run `quectel-qmi stop`, wait two seconds, then
# run `quectel-qmi start`. Output of both commands is appended to $LOG.
# A failing stop is ignored; the function's status is that of the start.
do_stop_start() {
	log "restarting quectel-qmi (stop -> start)"
	if ! /usr/lib/matelex/quectel-qmi stop >> "$LOG" 2>&1; then
		: # start is attempted regardless of how stop went
	fi
	sleep 2
	/usr/lib/matelex/quectel-qmi start >> "$LOG" 2>&1
}

# Reset the modem via `qmicli --dms-reset`, then wait for the link to recover.
# Globals:  DEV, LOG, MAX_TRIES_BEFORE_RESET, RESET_POLL_MAX_SEC,
#           RESET_POLL_INTERVAL_SEC (read).
# Returns:  0 as soon as status is "connected" and a ping succeeds,
#           1 if the link is still down after the poll window and one
#           restart of quectel-qmi.service.
# The poll window is measured in wall-clock time, so the time spent in the
# status and ping checks counts towards RESET_POLL_MAX_SEC.
do_dms_reset_then_wait()
{
	local deadline

	log "triggering dms-reset after ${MAX_TRIES_BEFORE_RESET} failed attempts"
	/usr/bin/qmicli -d "$DEV" --dms-reset >> "$LOG" 2>&1 || true

	log "polling up to ${RESET_POLL_MAX_SEC}s after reset (interval ${RESET_POLL_INTERVAL_SEC}s)"
	# SECONDS is bash's count of seconds since the shell started.
	deadline=$((SECONDS + RESET_POLL_MAX_SEC))
	while [ "$SECONDS" -lt "$deadline" ]; do
		# If network becomes connected and ping works, consider recovered.
		if [ "$(get_qmi_network_status)" = "connected" ] && ping_ok; then
			log "recovered during post-reset polling"
			return 0
		fi
		sleep "$RESET_POLL_INTERVAL_SEC"
	done

	log "post-reset polling timed out; restarting quectel-qmi.service"
	/bin/systemctl restart quectel-qmi.service >> "$LOG" 2>&1 || true
	sleep 3
	if [ "$(get_qmi_network_status)" = "connected" ] && ping_ok; then
		log "recovered after service restart"
		return 0
	fi

	log "still not recovered after reset+restart"
	return 1
}

# ---------------------------------------------------------------------------
# Main flow. Every path exits with status 0.
#   1. No control device node                 -> log and exit.
#   2. No QMI network interface               -> one stop/start, exit.
#   3. PS barring reported                    -> count a failure, exit.
#   4. Up to MAX_TRIES_BEFORE_RESET rounds of check + stop/start.
#   5. One more check, so the last stop/start is verified before escalating.
#   6. dms-reset with polling; count a failure if that does not help either.
#
# NOTE(review): a run that reaches step 6 can last about as long as the
# 15-minute cron interval named in the file header, and nothing visible here
# prevents two runs from overlapping -- confirm whether the cron entry
# serialises invocations.
# ---------------------------------------------------------------------------
if [ ! -c "$DEV" ]; then
	log "no $DEV device node; cannot manage QMI, exiting"
	log "=== qmi-failover done (no device) ==="
	exit 0
fi

load_fail_count

if [ -z "$IFACE" ]; then
	log "no QMI interface (qmi/raw_ip) present; attempting stop/start once"
	do_stop_start
	log "=== qmi-failover done (no qmi-iface) ==="
	exit 0
fi

# PS barring: do not flap the modem; just log and wait for next cron run.
if is_ps_barring; then
	log "PS barring detected (Packet switched: all-calls). No restart/reset; waiting for next run."
	record_failure_and_maybe_reboot "ps-barring"
	log "=== qmi-failover done (ps-barring) ==="
	exit 0
fi

tries=0
while [ "$tries" -lt "$MAX_TRIES_BEFORE_RESET" ]; do
	log "ping check on $IFACE to $PING_TARGET (attempt $((tries + 1))/$MAX_TRIES_BEFORE_RESET)"

	ping_ok
	ping_rc=$?
	status="$(get_qmi_network_status)"
	log "qmi-network status: ${status:-unknown}; ping_rc=$ping_rc"

	if [ "$status" = "connected" ] && [ "$ping_rc" -eq 0 ]; then
		log "ping ok and status connected; no action"
		reset_fail_count
		log "=== qmi-failover done (ok) ==="
		exit 0
	fi

	tries=$((tries + 1))
	log "failure detected (need status=connected and ping ok). stop/start attempt $tries"
	do_stop_start
	sleep 2
done

# The loop ends right after a stop/start, so that last restart has not been
# verified yet. Check once more before resetting a modem that may already be
# working again.
log "re-checking link after stop/start attempt $tries before dms-reset"
ping_ok
ping_rc=$?
status="$(get_qmi_network_status)"
log "qmi-network status: ${status:-unknown}; ping_rc=$ping_rc"
if [ "$status" = "connected" ] && [ "$ping_rc" -eq 0 ]; then
	log "recovered after stop/start attempt $tries; skipping dms-reset"
	reset_fail_count
	log "=== qmi-failover done (recovered) ==="
	exit 0
fi

if do_dms_reset_then_wait; then
	reset_fail_count
	log "=== qmi-failover done (recovered) ==="
	exit 0
fi

record_failure_and_maybe_reboot "not-recovered"
log "=== qmi-failover done (failed) ==="
exit 0