diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index 80a838b..c417ba1 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -275,75 +275,110 @@ jobs: MONITOR_PID=$! echo "๐Ÿ” [HANG DEBUG] Started background monitor (PID: $MONITOR_PID)" - # Run pytest with timeout and enhanced logging - echo "๐Ÿš€ [HANG DEBUG] Starting pytest with 600s timeout..." + # Run pytest with enhanced real-time monitoring (no dependency on pytest logs) + echo "๐Ÿš€ [HANG DEBUG] Starting pytest with 600s timeout and external monitoring..." - # Pre-test import debugging to identify hang point - echo "๐Ÿ” [HANG DEBUG] Testing individual imports to identify hang source..." - timeout 120 python -c " - import sys - import time - def timed_import(module, desc): - start = time.time() - print(f'โณ Importing {desc}...', flush=True) - try: - __import__(module) - elapsed = time.time() - start - print(f'โœ… {desc} imported in {elapsed:.2f}s', flush=True) - except Exception as e: - elapsed = time.time() - start - print(f'โŒ {desc} failed after {elapsed:.2f}s: {e}', flush=True) - raise + # Start independent process monitor that doesn't rely on pytest output + external_monitor() { + local pytest_pid=$1 + local start_time=$(date +%s) + local last_cpu_check=0 + local stable_count=0 - print('๐Ÿงช Testing individual module imports...', flush=True) - timed_import('leann', 'Core LEANN package') - timed_import('leann.api', 'LEANN API module') + while true; do + sleep 5 + current_time=$(date +%s) + elapsed=$((current_time - start_time)) - print('๐Ÿงช Testing backend imports (most likely hang point)...', flush=True) - timed_import('leann_backend_hnsw', 'HNSW backend package') - timed_import('leann_backend_diskann', 'DiskANN backend package') - - print('๐Ÿงช Testing deep imports...', flush=True) - timed_import('leann_backend_hnsw.hnsw_backend', 'HNSW backend module') - timed_import('leann_backend_diskann.diskann_backend', 'DiskANN backend module') - - print('โœ… All imports completed successfully', flush=True) - " 2>&1 | while IFS= read -r line; do - echo "$(date +"%H:%M:%S") [IMPORT] $line" - done - IMPORT_EXIT=$? - - if [ $IMPORT_EXIT -eq 124 ]; then - echo "โš ๏ธ [HANG DEBUG] IMPORT TIMEOUT! Import process hung - this is likely the root cause" - echo "๐Ÿ’€ [HANG DEBUG] Killing any hanging Python processes..." - pkill -KILL -f python || true - exit 1 - elif [ $IMPORT_EXIT -ne 0 ]; then - echo "โŒ [HANG DEBUG] Import tests failed with exit code: $IMPORT_EXIT" - exit $IMPORT_EXIT - fi - - echo "โœ… [HANG DEBUG] Import tests passed, proceeding with pytest..." - timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c ' - echo "โ–ถ๏ธ [HANG DEBUG] Pytest starting at: $(date)" - pytest tests/ -v --tb=short --maxfail=5 -x 2>&1 | while IFS= read -r line; do - echo "$(date +"%H:%M:%S") [PYTEST] $line" - done - echo "โœ… [HANG DEBUG] Pytest completed at: $(date)" - - # Monitor post-pytest cleanup for 30 seconds - echo "๐Ÿงน [HANG DEBUG] Monitoring post-pytest cleanup..." - for i in {1..30}; do - echo "๐Ÿ” [HANG DEBUG] Cleanup check $i/30 at $(date)" - ps aux | grep -E "(python|pytest|embedding)" | grep -v grep | head -5 - if [ $(ps aux | grep -E "(python|pytest)" | grep -v grep | wc -l) -eq 0 ]; then - echo "โœ… [HANG DEBUG] All Python processes cleaned up successfully" + # Check if pytest process still exists + if ! kill -0 $pytest_pid 2>/dev/null; then + echo "๐Ÿ“Š [EXTERNAL] $(date): Pytest process $pytest_pid ended after ${elapsed}s" + break + fi + + # Get detailed process info + ps_info=$(ps -p $pytest_pid -o pid,ppid,time,pcpu,pmem,state,comm 2>/dev/null || echo "PROCESS_GONE") + if [ "$ps_info" != "PROCESS_GONE" ]; then + echo "๐Ÿ“Š [EXTERNAL] $(date): Process $pytest_pid - ${ps_info}" + + # Extract CPU percentage and check for stability + current_cpu=$(echo "$ps_info" | tail -1 | awk '{print $4}' | cut -d. -f1) + if [ "$current_cpu" = "$last_cpu_check" ] && [ "$current_cpu" -lt 5 ]; then + stable_count=$((stable_count + 1)) + if [ $stable_count -ge 6 ]; then # 30 seconds of low CPU + echo "โš ๏ธ [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s" + fi + else + stable_count=0 + fi + last_cpu_check=$current_cpu + + # Check for zombie/stopped state + state=$(echo "$ps_info" | tail -1 | awk '{print $6}') + if [ "$state" = "Z" ] || [ "$state" = "T" ]; then + echo "๐Ÿ’€ [EXTERNAL] $(date): Process in abnormal state: $state" + fi + fi + + # Check for orphaned Python processes + orphan_count=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l) + if [ $orphan_count -gt 1 ]; then + echo "๐Ÿ” [EXTERNAL] $(date): Found $orphan_count pytest-related processes" + ps aux | grep -E "python.*pytest" | grep -v grep + fi + + # Emergency timeout + if [ $elapsed -gt 650 ]; then + echo "๐Ÿ’ฅ [EXTERNAL] $(date): Emergency timeout reached, force killing pytest" + kill -KILL $pytest_pid 2>/dev/null || true break fi - sleep 1 done - echo "๐Ÿ [HANG DEBUG] Cleanup monitoring finished" - ' + } + + # Run pytest in background so we can monitor it externally + python -u -c " + import sys, time + print(f'๐Ÿ” [REALTIME] {time.strftime(\"%H:%M:%S\")} Starting pytest...', flush=True) + " + timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c ' + echo "โ–ถ๏ธ [HANG DEBUG] Pytest starting at: $(date)" + # Force unbuffered output and immediate flush + stdbuf -o0 -e0 pytest tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do + printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line" + # Force flush after each line + sync + done + PYTEST_RESULT=${PIPESTATUS[0]} + echo "โœ… [HANG DEBUG] Pytest completed at: $(date) with exit code: $PYTEST_RESULT" + exit $PYTEST_RESULT + ' & + PYTEST_PID=$! + echo "๐Ÿ” [HANG DEBUG] Pytest started with PID: $PYTEST_PID" + + # Start external monitoring + external_monitor $PYTEST_PID & + EXTERNAL_MONITOR_PID=$! + + # Wait for pytest to complete + wait $PYTEST_PID + PYTEST_EXIT=$? + echo "๐Ÿ [HANG DEBUG] Pytest process ended with exit code: $PYTEST_EXIT" + + # Stop external monitor + kill $EXTERNAL_MONITOR_PID 2>/dev/null || true + + # Final cleanup check + echo "๐Ÿงน [HANG DEBUG] Final cleanup check..." + REMAINING_PROCS=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l) + if [ $REMAINING_PROCS -gt 0 ]; then + echo "โš ๏ธ [HANG DEBUG] Found $REMAINING_PROCS remaining pytest processes after completion" + ps aux | grep -E "python.*pytest" | grep -v grep + echo "๐Ÿ’€ [HANG DEBUG] Force killing remaining processes..." + ps aux | grep -E "python.*pytest" | grep -v grep | awk "{print \$2}" | xargs -r kill -KILL + else + echo "โœ… [HANG DEBUG] No remaining pytest processes found" + fi PYTEST_EXIT=$? # Stop background monitoring