debug: add external process monitoring and unbuffered output for precise hang detection

This commit is contained in:
Andy Lee
2025-08-12 11:27:37 -07:00
parent aa2002dc3a
commit ffbf0282c3

View File

@@ -275,75 +275,110 @@ jobs:
MONITOR_PID=$! MONITOR_PID=$!
echo "🔍 [HANG DEBUG] Started background monitor (PID: $MONITOR_PID)" echo "🔍 [HANG DEBUG] Started background monitor (PID: $MONITOR_PID)"
# Run pytest with timeout and enhanced logging # Run pytest with enhanced real-time monitoring (no dependency on pytest logs)
echo "🚀 [HANG DEBUG] Starting pytest with 600s timeout..." echo "🚀 [HANG DEBUG] Starting pytest with 600s timeout and external monitoring..."
# Pre-test import debugging to identify hang point # Start independent process monitor that doesn't rely on pytest output
echo "🔍 [HANG DEBUG] Testing individual imports to identify hang source..." external_monitor() {
timeout 120 python -c " local pytest_pid=$1
import sys local start_time=$(date +%s)
import time local last_cpu_check=0
def timed_import(module, desc): local stable_count=0
start = time.time()
print(f'⏳ Importing {desc}...', flush=True)
try:
__import__(module)
elapsed = time.time() - start
print(f'✅ {desc} imported in {elapsed:.2f}s', flush=True)
except Exception as e:
elapsed = time.time() - start
print(f'❌ {desc} failed after {elapsed:.2f}s: {e}', flush=True)
raise
print('🧪 Testing individual module imports...', flush=True) while true; do
timed_import('leann', 'Core LEANN package') sleep 5
timed_import('leann.api', 'LEANN API module') current_time=$(date +%s)
elapsed=$((current_time - start_time))
print('🧪 Testing backend imports (most likely hang point)...', flush=True) # Check if pytest process still exists
timed_import('leann_backend_hnsw', 'HNSW backend package') if ! kill -0 $pytest_pid 2>/dev/null; then
timed_import('leann_backend_diskann', 'DiskANN backend package') echo "📊 [EXTERNAL] $(date): Pytest process $pytest_pid ended after ${elapsed}s"
break
print('🧪 Testing deep imports...', flush=True) fi
timed_import('leann_backend_hnsw.hnsw_backend', 'HNSW backend module')
timed_import('leann_backend_diskann.diskann_backend', 'DiskANN backend module') # Get detailed process info
ps_info=$(ps -p $pytest_pid -o pid,ppid,time,pcpu,pmem,state,comm 2>/dev/null || echo "PROCESS_GONE")
print('✅ All imports completed successfully', flush=True) if [ "$ps_info" != "PROCESS_GONE" ]; then
" 2>&1 | while IFS= read -r line; do echo "📊 [EXTERNAL] $(date): Process $pytest_pid - ${ps_info}"
echo "$(date +"%H:%M:%S") [IMPORT] $line"
done # Extract CPU percentage and check for stability
IMPORT_EXIT=$? current_cpu=$(echo "$ps_info" | tail -1 | awk '{print $4}' | cut -d. -f1)
if [ "$current_cpu" = "$last_cpu_check" ] && [ "$current_cpu" -lt 5 ]; then
if [ $IMPORT_EXIT -eq 124 ]; then stable_count=$((stable_count + 1))
echo "⚠️ [HANG DEBUG] IMPORT TIMEOUT! Import process hung - this is likely the root cause" if [ $stable_count -ge 6 ]; then # 30 seconds of low CPU
echo "💀 [HANG DEBUG] Killing any hanging Python processes..." echo "⚠️ [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s"
pkill -KILL -f python || true fi
exit 1 else
elif [ $IMPORT_EXIT -ne 0 ]; then stable_count=0
echo "❌ [HANG DEBUG] Import tests failed with exit code: $IMPORT_EXIT" fi
exit $IMPORT_EXIT last_cpu_check=$current_cpu
fi
# Check for zombie/stopped state
echo "✅ [HANG DEBUG] Import tests passed, proceeding with pytest..." state=$(echo "$ps_info" | tail -1 | awk '{print $6}')
timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c ' if [ "$state" = "Z" ] || [ "$state" = "T" ]; then
echo "▶️ [HANG DEBUG] Pytest starting at: $(date)" echo "💀 [EXTERNAL] $(date): Process in abnormal state: $state"
pytest tests/ -v --tb=short --maxfail=5 -x 2>&1 | while IFS= read -r line; do fi
echo "$(date +"%H:%M:%S") [PYTEST] $line" fi
done
echo "✅ [HANG DEBUG] Pytest completed at: $(date)" # Check for orphaned Python processes
orphan_count=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l)
# Monitor post-pytest cleanup for 30 seconds if [ $orphan_count -gt 1 ]; then
echo "🧹 [HANG DEBUG] Monitoring post-pytest cleanup..." echo "🔍 [EXTERNAL] $(date): Found $orphan_count pytest-related processes"
for i in {1..30}; do ps aux | grep -E "python.*pytest" | grep -v grep
echo "🔍 [HANG DEBUG] Cleanup check $i/30 at $(date)" fi
ps aux | grep -E "(python|pytest|embedding)" | grep -v grep | head -5
if [ $(ps aux | grep -E "(python|pytest)" | grep -v grep | wc -l) -eq 0 ]; then # Emergency timeout
echo "✅ [HANG DEBUG] All Python processes cleaned up successfully" if [ $elapsed -gt 650 ]; then
echo "💥 [EXTERNAL] $(date): Emergency timeout reached, force killing pytest"
kill -KILL $pytest_pid 2>/dev/null || true
break break
fi fi
sleep 1
done done
echo "🏁 [HANG DEBUG] Cleanup monitoring finished" }
'
# Run pytest in background so we can monitor it externally
python -u -c "
import sys, time
print(f'🔍 [REALTIME] {time.strftime(\"%H:%M:%S\")} Starting pytest...', flush=True)
"
timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c '
echo "▶️ [HANG DEBUG] Pytest starting at: $(date)"
# Force unbuffered output and immediate flush
stdbuf -o0 -e0 pytest tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
# Force flush after each line
sync
done
PYTEST_RESULT=${PIPESTATUS[0]}
echo "✅ [HANG DEBUG] Pytest completed at: $(date) with exit code: $PYTEST_RESULT"
exit $PYTEST_RESULT
' &
PYTEST_PID=$!
echo "🔍 [HANG DEBUG] Pytest started with PID: $PYTEST_PID"
# Start external monitoring
external_monitor $PYTEST_PID &
EXTERNAL_MONITOR_PID=$!
# Wait for pytest to complete
wait $PYTEST_PID
PYTEST_EXIT=$?
echo "🏁 [HANG DEBUG] Pytest process ended with exit code: $PYTEST_EXIT"
# Stop external monitor
kill $EXTERNAL_MONITOR_PID 2>/dev/null || true
# Final cleanup check
echo "🧹 [HANG DEBUG] Final cleanup check..."
REMAINING_PROCS=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l)
if [ $REMAINING_PROCS -gt 0 ]; then
echo "⚠️ [HANG DEBUG] Found $REMAINING_PROCS remaining pytest processes after completion"
ps aux | grep -E "python.*pytest" | grep -v grep
echo "💀 [HANG DEBUG] Force killing remaining processes..."
ps aux | grep -E "python.*pytest" | grep -v grep | awk "{print \$2}" | xargs -r kill -KILL
else
echo "✅ [HANG DEBUG] No remaining pytest processes found"
fi
PYTEST_EXIT=$? PYTEST_EXIT=$?
# Stop background monitoring # Stop background monitoring