fix: prevent OOM on rapid schema publishing

Add concurrency-limited CosmoGenerator (semaphore limit=1, 60s timeout) to prevent unbounded concurrent wgc process spawning. Add debouncer (500ms) to coalesce rapid schema updates per org+ref. Fix double subgraph fetch in Supergraph resolver and goroutine leak in SchemaUpdates subscription.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 08:05:47 +01:00
parent a9885f8b65
commit 28aa32ad8c
8 changed files with 283 additions and 60 deletions
@@ -1,11 +1,15 @@
 package graph

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"os"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -459,6 +463,114 @@ func TestGenerateCosmoRouterConfig_MockError(t *testing.T) {
 	assert.Equal(t, 1, mockExecutor.CallCount, "Should have attempted to call executor")
 }

+// SlowMockExecutor is a MockCommandExecutor that sleeps in Execute and tracks overlapping calls.
+type SlowMockExecutor struct {
+	MockCommandExecutor
+	mu         sync.Mutex    // held while calling into the embedded mock
+	delay      time.Duration // how long Execute sleeps before delegating
+	concurrent atomic.Int32  // Execute calls currently in flight
+	maxSeen    atomic.Int32  // largest value concurrent has reached
+}
+
+// Execute counts itself as in flight, raises maxSeen on a new peak, sleeps, then delegates to the mock.
+func (m *SlowMockExecutor) Execute(name string, args ...string) ([]byte, error) {
+	inFlight := m.concurrent.Add(1)
+	defer m.concurrent.Add(-1)
+	// CAS loop: stop once the recorded peak is at least inFlight.
+	for peak := m.maxSeen.Load(); inFlight > peak; peak = m.maxSeen.Load() {
+		if m.maxSeen.CompareAndSwap(peak, inFlight) {
+			break
+		}
+	}
+	time.Sleep(m.delay)
+
+	// Serialize access to the embedded mock.
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.MockCommandExecutor.Execute(name, args...)
+}
+
+// TestCosmoGenerator_ConcurrencyLimit launches five Generate calls at once and
+// asserts that the executor never saw more than one of them in flight.
+func TestCosmoGenerator_ConcurrencyLimit(t *testing.T) {
+	const callers = 5
+	slow := &SlowMockExecutor{delay: 100 * time.Millisecond}
+	generator := NewCosmoGenerator(slow, 5*time.Second)
+	input := []*model.SubGraph{{
+		Service: "svc",
+		URL:     stringPtr("http://localhost:4001/query"),
+		Sdl:     "type Query { hello: String }",
+	}}
+
+	var pending sync.WaitGroup
+	pending.Add(callers)
+	for i := 0; i < callers; i++ {
+		go func() {
+			defer pending.Done()
+			// Results are irrelevant here; only the overlap counter matters.
+			_, _ = generator.Generate(context.Background(), input)
+		}()
+	}
+	pending.Wait()
+
+	assert.Equal(t, int32(1), slow.maxSeen.Load(), "at most 1 wgc process should run concurrently")
+}
+
+// TestCosmoGenerator_Timeout verifies that Generate gives up with an
+// "acquire cosmo generator" error when another call keeps the generator busy
+// for longer than the configured timeout.
+func TestCosmoGenerator_Timeout(t *testing.T) {
+	// The executor sleeps 500ms per call, ten times the 50ms generator timeout.
+	executor := &SlowMockExecutor{delay: 500 * time.Millisecond}
+	gen := NewCosmoGenerator(executor, 50*time.Millisecond)
+
+	subGraphs := []*model.SubGraph{
+		{
+			Service: "svc",
+			URL:     stringPtr("http://localhost:4001/query"),
+			Sdl:     "type Query { hello: String }",
+		},
+	}
+
+	// First call: keeps the generator busy while the executor sleeps.
+	firstDone := make(chan struct{})
+	go func() {
+		defer close(firstDone)
+		// The outcome of the blocking call is irrelevant to this test.
+		_, _ = gen.Generate(context.Background(), subGraphs)
+	}()
+	// Wait for the background call before returning so it cannot outlive the
+	// test; deferred calls also run when require aborts via FailNow.
+	defer func() { <-firstDone }()
+
+	// Poll until the first call is inside the executor instead of sleeping a
+	// fixed interval, which could be too short on a loaded machine.
+	require.Eventually(t, func() bool {
+		return executor.concurrent.Load() == 1
+	}, time.Second, time.Millisecond, "first Generate call never reached the executor")
+
+	// Second call: should time out waiting for the generator.
+	_, err := gen.Generate(context.Background(), subGraphs)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "acquire cosmo generator")
+}
+
+// TestCosmoGenerator_ContextCancellation verifies that Generate returns an
+// "acquire cosmo generator" error when its context is already cancelled and
+// another call is keeping the generator busy.
+func TestCosmoGenerator_ContextCancellation(t *testing.T) {
+	executor := &SlowMockExecutor{delay: 500 * time.Millisecond}
+	gen := NewCosmoGenerator(executor, 5*time.Second)
+
+	subGraphs := []*model.SubGraph{
+		{
+			Service: "svc",
+			URL:     stringPtr("http://localhost:4001/query"),
+			Sdl:     "type Query { hello: String }",
+		},
+	}
+
+	// First call: keeps the generator busy while the executor sleeps.
+	firstDone := make(chan struct{})
+	go func() {
+		defer close(firstDone)
+		// The outcome of the blocking call is irrelevant to this test.
+		_, _ = gen.Generate(context.Background(), subGraphs)
+	}()
+	// Wait for the background call before returning so it cannot outlive the
+	// test; deferred calls also run when require aborts via FailNow.
+	defer func() { <-firstDone }()
+
+	// Poll until the first call is inside the executor. If the generator were
+	// still free, the cancelled call below might acquire it and succeed.
+	require.Eventually(t, func() bool {
+		return executor.concurrent.Load() == 1
+	}, time.Second, time.Millisecond, "first Generate call never reached the executor")
+
+	// Second call with an already-cancelled context.
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	_, err := gen.Generate(ctx, subGraphs)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "acquire cosmo generator")
+}
+
 // Helper function for tests
 func stringPtr(s string) *string {
 	return &s