perf(graph): cache merged SDL and SchemaUpdate per ref
schemas / vulnerabilities (pull_request) Successful in 2m8s
schemas / check (pull_request) Successful in 3m5s
schemas / check-release (pull_request) Successful in 5m14s
pre-commit / pre-commit (pull_request) Successful in 6m55s
schemas / build (pull_request) Successful in 5m44s
schemas / deploy-prod (pull_request) Has been skipped

Both the Supergraph and LatestSchema resolvers recomputed their results
on every request. The work is non-trivial:

- Supergraph: sdlmerge.MergeSDLs() runs AST validation + normalization
  + custom merge walkers over all subgraph SDLs.
- LatestSchema: CosmoGenerator.Generate() shells out to wgc router
  compose (Node via npx), spending 100-300m (millicores) of CPU per
  call.

Because the output is fully determined by the set of subgraph SDLs and
their lastUpdate timestamp, the result can be cached and reused across
requests until a SubGraphUpdated event bumps the lastUpdate for the
(orgId, ref) key.

Add two precomputation caches to cache.Cache, both versioned by the
existing lastUpdate map so a single timestamp comparison invalidates
stale entries implicitly:

- mergedSDLs: cached MergeSDLs output for Supergraph
- schemaUpdates: cached SchemaUpdate (subgraphs + cosmo config) for
  LatestSchema

The UpdateSubGraph debounce already computes the cosmo config to
publish through PubSub; it now also stores the SchemaUpdate so the
next LatestSchema query is warm. OrganizationRemoved evicts both
caches alongside lastUpdate.

This eliminates the per-request CPU bursts that were driving the HPA's
desired replica count above its configured maximum (the
TooManyReplicas condition).
This commit is contained in:
2026-05-19 09:37:43 +02:00
parent 9a4b05d897
commit d652c1e446
2 changed files with 100 additions and 3 deletions
+26 -1
View File
@@ -210,6 +210,7 @@ func (r *mutationResolver) UpdateSubGraph(ctx context.Context, input model.Input
SubGraphs: subGraphs,
CosmoRouterConfig: &cosmoConfig,
}
r.Cache.SetSchemaUpdate(orgId, input.Ref, update)
r.Logger.Info(
"Publishing schema update to subscribers",
@@ -280,13 +281,25 @@ func (r *queryResolver) Supergraph(ctx context.Context, ref string, isAfter *str
if isAfter != nil {
after = *isAfter
}
services, lastUpdate := r.Cache.Services(orgId, ref, after)
_, lastUpdate := r.Cache.Services(orgId, ref, after)
if after == lastUpdate {
return &model.Unchanged{
ID: lastUpdate,
MinDelaySeconds: 10,
}, nil
}
if cached := r.Cache.GetMergedSDL(orgId, ref); cached != nil {
id, sdl, subGraphs := cached.Unpack()
return &model.SubGraphs{
ID: id,
SubGraphs: subGraphs,
Sdl: sdl,
MinDelaySeconds: 10,
}, nil
}
services, _ := r.Cache.Services(orgId, ref, "")
subGraphs := make([]*model.SubGraph, len(services))
serviceSDLs := make([]string, len(services))
for i, id := range services {
@@ -302,6 +315,7 @@ func (r *queryResolver) Supergraph(ctx context.Context, ref string, isAfter *str
if err != nil {
return nil, err
}
r.Cache.SetMergedSDL(orgId, ref, lastUpdate, sdl, subGraphs)
return &model.SubGraphs{
ID: lastUpdate,
SubGraphs: subGraphs,
@@ -344,6 +358,16 @@ func (r *queryResolver) LatestSchema(ctx context.Context, ref string) (*model.Sc
return nil, fmt.Errorf("no authentication provided")
}
if cached := r.Cache.GetSchemaUpdate(orgId, ref); cached != nil {
r.Logger.Info(
"Latest schema served from cache",
"ref", ref,
"orgId", orgId,
"id", cached.ID,
)
return cached, nil
}
// Get current services and schema
services, lastUpdate := r.Cache.Services(orgId, ref, "")
r.Logger.Info(
@@ -385,6 +409,7 @@ func (r *queryResolver) LatestSchema(ctx context.Context, ref string) (*model.Sc
SubGraphs: subGraphs,
CosmoRouterConfig: &cosmoConfig,
}
r.Cache.SetSchemaUpdate(orgId, ref, update)
r.Logger.Info(
"Latest schema fetched",