perf(graph): cache merged SDL and SchemaUpdate per ref

Both the Supergraph and LatestSchema resolvers recomputed their result on every request. The work is non-trivial:

- Supergraph: sdlmerge.MergeSDLs() runs AST validation + normalization + custom merge walkers over all subgraph SDLs.
- LatestSchema: CosmoGenerator.Generate() shells out to `wgc router compose` (Node via npx), spending 100-300m CPU (millicores) per call.

Because the output is fully determined by the set of subgraph SDLs and their lastUpdate timestamp, the result can be cached and reused across requests until a SubGraphUpdated event bumps the lastUpdate for the (orgId, ref) key.

Add two precomputation caches to cache.Cache, both versioned by the existing lastUpdate map so a single timestamp comparison invalidates stale entries implicitly:

- mergedSDLs: cached MergeSDLs output for Supergraph
- schemaUpdates: cached SchemaUpdate (subgraphs + cosmo config) for LatestSchema

The UpdateSubGraph debounce already computes the cosmo config to publish through PubSub; it now also stores the SchemaUpdate so the next LatestSchema query is warm. OrganizationRemoved evicts both caches alongside lastUpdate.

This eliminates the per-request CPU bursts that were tripping the HPA into TooManyReplicas territory.
2026-05-19 09:37:43 +02:00
parent 9a4b05d897
commit d652c1e446
2 changed files with 100 additions and 3 deletions
@@ -210,6 +210,7 @@ func (r *mutationResolver) UpdateSubGraph(ctx context.Context, input model.Input
 			SubGraphs:         subGraphs,
 			CosmoRouterConfig: &cosmoConfig,
 		}
+		r.Cache.SetSchemaUpdate(orgId, input.Ref, update)

 		r.Logger.Info(
 			"Publishing schema update to subscribers",
@@ -280,13 +281,25 @@ func (r *queryResolver) Supergraph(ctx context.Context, ref string, isAfter *str
 	if isAfter != nil {
 		after = *isAfter
 	}
-	services, lastUpdate := r.Cache.Services(orgId, ref, after)
+	_, lastUpdate := r.Cache.Services(orgId, ref, after)
 	if after == lastUpdate {
 		return &model.Unchanged{
 			ID:              lastUpdate,
 			MinDelaySeconds: 10,
 		}, nil
 	}
+
+	if cached := r.Cache.GetMergedSDL(orgId, ref); cached != nil {
+		id, sdl, subGraphs := cached.Unpack()
+		return &model.SubGraphs{
+			ID:              id,
+			SubGraphs:       subGraphs,
+			Sdl:             sdl,
+			MinDelaySeconds: 10,
+		}, nil
+	}
+
+	services, _ := r.Cache.Services(orgId, ref, "")
 	subGraphs := make([]*model.SubGraph, len(services))
 	serviceSDLs := make([]string, len(services))
 	for i, id := range services {
@@ -302,6 +315,7 @@ func (r *queryResolver) Supergraph(ctx context.Context, ref string, isAfter *str
 	if err != nil {
 		return nil, err
 	}
+	r.Cache.SetMergedSDL(orgId, ref, lastUpdate, sdl, subGraphs)
 	return &model.SubGraphs{
 		ID:              lastUpdate,
 		SubGraphs:       subGraphs,
@@ -344,6 +358,16 @@ func (r *queryResolver) LatestSchema(ctx context.Context, ref string) (*model.Sc
 		return nil, fmt.Errorf("no authentication provided")
 	}

+	if cached := r.Cache.GetSchemaUpdate(orgId, ref); cached != nil {
+		r.Logger.Info(
+			"Latest schema served from cache",
+			"ref", ref,
+			"orgId", orgId,
+			"id", cached.ID,
+		)
+		return cached, nil
+	}
+
 	// Get current services and schema
 	services, lastUpdate := r.Cache.Services(orgId, ref, "")
 	r.Logger.Info(
@@ -385,6 +409,7 @@ func (r *queryResolver) LatestSchema(ctx context.Context, ref string) (*model.Sc
 		SubGraphs:         subGraphs,
 		CosmoRouterConfig: &cosmoConfig,
 	}
+	r.Cache.SetSchemaUpdate(orgId, ref, update)

 	r.Logger.Info(
 		"Latest schema fetched",