Skip to content

Commit 8dc4a0c

Browse files
committed
chainrpc: retry notifier RPCs during startup lag
lnd v0.20.0-rc3 delays ChainNotifier startup which causes Loop to hit "chain notifier RPC is still in the process of starting" during initial subscriptions (LND commit c6f458e478f9ef2cf1d394972bfbc512862c6707). Add a shared retry helper in lndclient so block epoch, confirmation and spend registrations transparently retry until the sub-server is ready, along with regression tests covering the behavior. With lnd PR lightningnetwork/lnd#10352 the server now returns gRPC codes.Unavailable instead of codes.Unknown, so the helper accepts either signal (status code or a string).
1 parent 6cbaf58 commit 8dc4a0c

File tree

2 files changed

+372
-15
lines changed

2 files changed

+372
-15
lines changed

chainnotifier_client.go

Lines changed: 130 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package lndclient
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"sync"
78
"time"
89

@@ -11,6 +12,8 @@ import (
1112
"github.com/lightningnetwork/lnd/chainntnfs"
1213
"github.com/lightningnetwork/lnd/lnrpc/chainrpc"
1314
"google.golang.org/grpc"
15+
"google.golang.org/grpc/codes"
16+
"google.golang.org/grpc/status"
1417
)
1518

1619
// NotifierOptions is a set of functional options that allow callers to further
@@ -38,6 +41,19 @@ func DefaultNotifierOptions() *NotifierOptions {
3841
// events received from the notifier.
3942
type NotifierOption func(*NotifierOptions)
4043

44+
const (
45+
// chainNotifierStartupMessage matches the error string returned by lnd
46+
// v0.20.0-rc3+ when a ChainNotifier RPC is invoked before the
47+
// sub-server finishes initialization.
48+
chainNotifierStartupMessage = "chain notifier RPC is still in the " +
49+
"process of starting"
50+
51+
// chainNotifierRetryBackoff defines the delay between successive
52+
// subscription attempts while waiting for the ChainNotifier sub-server
53+
// to become operational.
54+
chainNotifierRetryBackoff = 500 * time.Millisecond
55+
)
56+
4157
// WithIncludeBlock is an optional argument that allows the caller to specify
4258
// that the block that mined a transaction should be included in the response.
4359
func WithIncludeBlock() NotifierOption {
@@ -133,11 +149,23 @@ func (s *chainNotifierClient) RegisterSpendNtfn(ctx context.Context,
133149
}
134150
}
135151

136-
macaroonAuth := s.chainMac.WithMacaroonAuth(ctx)
137-
resp, err := s.client.RegisterSpendNtfn(macaroonAuth, &chainrpc.SpendRequest{
138-
HeightHint: uint32(heightHint),
139-
Outpoint: rpcOutpoint,
140-
Script: pkScript,
152+
var (
153+
resp chainrpc.ChainNotifier_RegisterSpendNtfnClient
154+
err error
155+
)
156+
157+
// lnd v0.20.0-rc3 changed the startup ordering so the ChainNotifier
158+
// sub-server can report "still starting" for a short window. Retry the
159+
// registration in that case to avoid aborting clients that subscribe
160+
// immediately at startup.
161+
err = s.retryChainNotifierCall(ctx, func() error {
162+
macaroonAuth := s.chainMac.WithMacaroonAuth(ctx)
163+
resp, err = s.client.RegisterSpendNtfn(macaroonAuth, &chainrpc.SpendRequest{
164+
HeightHint: uint32(heightHint),
165+
Outpoint: rpcOutpoint,
166+
Script: pkScript,
167+
})
168+
return err
141169
})
142170
if err != nil {
143171
return nil, nil, err
@@ -251,15 +279,25 @@ func (s *chainNotifierClient) RegisterConfirmationsNtfn(ctx context.Context,
251279
if txid != nil {
252280
txidSlice = txid[:]
253281
}
254-
confStream, err := s.client.RegisterConfirmationsNtfn(
255-
s.chainMac.WithMacaroonAuth(ctx), &chainrpc.ConfRequest{
256-
Script: pkScript,
257-
NumConfs: uint32(numConfs),
258-
HeightHint: uint32(heightHint),
259-
Txid: txidSlice,
260-
IncludeBlock: opts.IncludeBlock,
261-
},
282+
var (
283+
confStream chainrpc.ChainNotifier_RegisterConfirmationsNtfnClient
284+
err error
262285
)
286+
// The confirmation RPC is also subject to the post-v0.20.0-rc3 startup
287+
// ordering change, so we retry here until lnd reports the sub-server
288+
// ready.
289+
err = s.retryChainNotifierCall(ctx, func() error {
290+
confStream, err = s.client.RegisterConfirmationsNtfn(
291+
s.chainMac.WithMacaroonAuth(ctx), &chainrpc.ConfRequest{
292+
Script: pkScript,
293+
NumConfs: uint32(numConfs),
294+
HeightHint: uint32(heightHint),
295+
Txid: txidSlice,
296+
IncludeBlock: opts.IncludeBlock,
297+
},
298+
)
299+
return err
300+
})
263301
if err != nil {
264302
return nil, nil, err
265303
}
@@ -362,9 +400,18 @@ func (s *chainNotifierClient) RegisterConfirmationsNtfn(ctx context.Context,
362400
func (s *chainNotifierClient) RegisterBlockEpochNtfn(ctx context.Context) (
363401
chan int32, chan error, error) {
364402

365-
blockEpochClient, err := s.client.RegisterBlockEpochNtfn(
366-
s.chainMac.WithMacaroonAuth(ctx), &chainrpc.BlockEpoch{},
403+
var (
404+
blockEpochClient chainrpc.ChainNotifier_RegisterBlockEpochNtfnClient
405+
err error
367406
)
407+
// Block epoch subscriptions similarly need to survive the "still
408+
// starting" period introduced in lnd v0.20.0-rc3.
409+
err = s.retryChainNotifierCall(ctx, func() error {
410+
blockEpochClient, err = s.client.RegisterBlockEpochNtfn(
411+
s.chainMac.WithMacaroonAuth(ctx), &chainrpc.BlockEpoch{},
412+
)
413+
return err
414+
})
368415
if err != nil {
369416
return nil, nil, err
370417
}
@@ -393,3 +440,71 @@ func (s *chainNotifierClient) RegisterBlockEpochNtfn(ctx context.Context) (
393440

394441
return blockEpochChan, blockErrorChan, nil
395442
}
443+
444+
// retryChainNotifierCall executes the passed RPC invocation, retrying while
445+
// lnd reports that the ChainNotifier sub-server is still initialising.
446+
//
447+
// Prior to v0.20.0-rc3 the ChainNotifier sub-server finished initialization
448+
// before dependent services started, so a single RPC attempt succeeded. From
449+
// rc3 (LND commit c6f458e478f9ef2cf1d394972bfbc512862c6707) onwards lnd starts
450+
// the notifier later in the daemon lifecycle to avoid rescans from stale
451+
// heights. During the brief gap between client connection and notifier
452+
// readiness lnd returns the string "chain notifier RPC is still in the process
453+
// of starting" wrapped in an Unknown gRPC status. Clients that interact with
454+
// lnd immediately after it connects - such as Loop during integration testing -
455+
// would previously treat that error as fatal and abort startup, even though
456+
// retrying shortly after would succeed.
457+
//
458+
// This helper centralises the retry policy: when the specific "still starting"
459+
// error is encountered we back off briefly and reissue the RPC. Non-startup
460+
// errors are returned to the caller unchanged, and the caller's context
461+
// controls the overall deadline so shutdown conditions are respected.
462+
func (s *chainNotifierClient) retryChainNotifierCall(ctx context.Context,
463+
call func() error) error {
464+
465+
for {
466+
err := call()
467+
if err == nil {
468+
return nil
469+
}
470+
471+
if !isChainNotifierStartingErr(err) {
472+
return err
473+
}
474+
475+
log.Warnf("Chain notifier RPC not ready yet, retrying: %v", err)
476+
477+
select {
478+
case <-time.After(chainNotifierRetryBackoff):
479+
continue
480+
481+
case <-ctx.Done():
482+
return ctx.Err()
483+
}
484+
}
485+
}
486+
487+
// detectChainNotifierStartupError reports whether err is due to the lnd
488+
// ChainNotifier sub-server still starting up. Starting with lnd v0.20.0-rc3
489+
// the notifier is initialised later in the daemon lifecycle, and the RPC layer
490+
// surfaces this as an Unknown gRPC status that contains the message defined in
491+
// chainNotifierStartupMessage. There is a PR in LND to return code Unavailable
492+
// instead of Unknown: https://github.com/lightningnetwork/lnd/pull/10352
493+
func isChainNotifierStartingErr(err error) bool {
494+
if err == nil {
495+
return false
496+
}
497+
498+
// gRPC code Unavailable means "the server can't handle this request
499+
// now, retry later". LND's chain notifier returns this error when
500+
// the server is starting.
501+
// See https://github.com/lightningnetwork/lnd/pull/10352
502+
st, ok := status.FromError(err)
503+
if ok && st.Code() == codes.Unavailable {
504+
return true
505+
}
506+
507+
// TODO(ln-v0.20.0) remove the string fallback once lndclient depends on
508+
// a version of lnd that returns codes.Unavailable for this condition.
509+
return strings.Contains(err.Error(), chainNotifierStartupMessage)
510+
}

0 commit comments

Comments
 (0)