-
Notifications
You must be signed in to change notification settings - Fork 47
chainrpc: retry notifier RPCs during startup lag #253
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
starius
wants to merge
1
commit into
lightninglabs:master
Choose a base branch
from
starius:fix-chainnotifier-lnd-20-rc3
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ package lndclient | |
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strings" | ||
| "sync" | ||
| "time" | ||
|
|
||
|
|
@@ -11,6 +12,8 @@ import ( | |
| "github.com/lightningnetwork/lnd/chainntnfs" | ||
| "github.com/lightningnetwork/lnd/lnrpc/chainrpc" | ||
| "google.golang.org/grpc" | ||
| "google.golang.org/grpc/codes" | ||
| "google.golang.org/grpc/status" | ||
| ) | ||
|
|
||
| // NotifierOptions is a set of functional options that allow callers to further | ||
|
|
@@ -38,6 +41,19 @@ func DefaultNotifierOptions() *NotifierOptions { | |
| // events received from the notifier. | ||
| type NotifierOption func(*NotifierOptions) | ||
|
|
||
| const ( | ||
| // chainNotifierStartupMessage matches the error string returned by lnd | ||
| // v0.20.0-rc3+ when a ChainNotifier RPC is invoked before the | ||
| // sub-server finishes initialization. | ||
| chainNotifierStartupMessage = "chain notifier RPC is still in the " + | ||
| "process of starting" | ||
|
|
||
| // chainNotifierRetryBackoff defines the delay between successive | ||
| // subscription attempts while waiting for the ChainNotifier sub-server | ||
| // to become operational. | ||
| chainNotifierRetryBackoff = 500 * time.Millisecond | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Like that fixed-delay backoff value. Seems a good heuristic value 👍 |
||
| ) | ||
|
|
||
| // WithIncludeBlock is an optional argument that allows the caller to specify | ||
| // that the block that mined a transaction should be included in the response. | ||
| func WithIncludeBlock() NotifierOption { | ||
|
|
@@ -133,11 +149,23 @@ func (s *chainNotifierClient) RegisterSpendNtfn(ctx context.Context, | |
| } | ||
| } | ||
|
|
||
| macaroonAuth := s.chainMac.WithMacaroonAuth(ctx) | ||
| resp, err := s.client.RegisterSpendNtfn(macaroonAuth, &chainrpc.SpendRequest{ | ||
| HeightHint: uint32(heightHint), | ||
| Outpoint: rpcOutpoint, | ||
| Script: pkScript, | ||
| var ( | ||
| resp chainrpc.ChainNotifier_RegisterSpendNtfnClient | ||
| err error | ||
| ) | ||
|
|
||
| // lnd v0.20.0-rc3 changed the startup ordering so the ChainNotifier | ||
| // sub-server can report "still starting" for a short window. Retry the | ||
| // registration in that case to avoid aborting clients that subscribe | ||
| // immediately at startup. | ||
| err = s.retryChainNotifierCall(ctx, func() error { | ||
| macaroonAuth := s.chainMac.WithMacaroonAuth(ctx) | ||
| resp, err = s.client.RegisterSpendNtfn(macaroonAuth, &chainrpc.SpendRequest{ | ||
| HeightHint: uint32(heightHint), | ||
| Outpoint: rpcOutpoint, | ||
| Script: pkScript, | ||
| }) | ||
| return err | ||
| }) | ||
| if err != nil { | ||
| return nil, nil, err | ||
|
|
@@ -251,15 +279,25 @@ func (s *chainNotifierClient) RegisterConfirmationsNtfn(ctx context.Context, | |
| if txid != nil { | ||
| txidSlice = txid[:] | ||
| } | ||
| confStream, err := s.client.RegisterConfirmationsNtfn( | ||
| s.chainMac.WithMacaroonAuth(ctx), &chainrpc.ConfRequest{ | ||
| Script: pkScript, | ||
| NumConfs: uint32(numConfs), | ||
| HeightHint: uint32(heightHint), | ||
| Txid: txidSlice, | ||
| IncludeBlock: opts.IncludeBlock, | ||
| }, | ||
| var ( | ||
| confStream chainrpc.ChainNotifier_RegisterConfirmationsNtfnClient | ||
| err error | ||
| ) | ||
| // The confirmation RPC is also subject to the post-v0.20.0-rc3 startup | ||
| // ordering change, so we retry here until lnd reports the sub-server | ||
| // ready. | ||
| err = s.retryChainNotifierCall(ctx, func() error { | ||
| confStream, err = s.client.RegisterConfirmationsNtfn( | ||
| s.chainMac.WithMacaroonAuth(ctx), &chainrpc.ConfRequest{ | ||
| Script: pkScript, | ||
| NumConfs: uint32(numConfs), | ||
| HeightHint: uint32(heightHint), | ||
| Txid: txidSlice, | ||
| IncludeBlock: opts.IncludeBlock, | ||
| }, | ||
| ) | ||
| return err | ||
| }) | ||
| if err != nil { | ||
| return nil, nil, err | ||
| } | ||
|
|
@@ -362,9 +400,18 @@ func (s *chainNotifierClient) RegisterConfirmationsNtfn(ctx context.Context, | |
| func (s *chainNotifierClient) RegisterBlockEpochNtfn(ctx context.Context) ( | ||
| chan int32, chan error, error) { | ||
|
|
||
| blockEpochClient, err := s.client.RegisterBlockEpochNtfn( | ||
| s.chainMac.WithMacaroonAuth(ctx), &chainrpc.BlockEpoch{}, | ||
| var ( | ||
| blockEpochClient chainrpc.ChainNotifier_RegisterBlockEpochNtfnClient | ||
| err error | ||
| ) | ||
| // Block epoch subscriptions similarly need to survive the "still | ||
| // starting" period introduced in lnd v0.20.0-rc3. | ||
| err = s.retryChainNotifierCall(ctx, func() error { | ||
| blockEpochClient, err = s.client.RegisterBlockEpochNtfn( | ||
| s.chainMac.WithMacaroonAuth(ctx), &chainrpc.BlockEpoch{}, | ||
| ) | ||
| return err | ||
| }) | ||
| if err != nil { | ||
| return nil, nil, err | ||
| } | ||
|
|
@@ -393,3 +440,71 @@ func (s *chainNotifierClient) RegisterBlockEpochNtfn(ctx context.Context) ( | |
|
|
||
| return blockEpochChan, blockErrorChan, nil | ||
| } | ||
|
|
||
| // retryChainNotifierCall executes the passed RPC invocation, retrying while | ||
| // lnd reports that the ChainNotifier sub-server is still initialising. | ||
| // | ||
| // Prior to v0.20.0-rc3 the ChainNotifier sub-server finished initialization | ||
| // before dependent services started, so a single RPC attempt succeeded. From | ||
| // rc3 (LND commit c6f458e478f9ef2cf1d394972bfbc512862c6707) onwards lnd starts | ||
| // the notifier later in the daemon lifecycle to avoid rescans from stale | ||
| // heights. During the brief gap between client connection and notifier | ||
| // readiness lnd returns the string "chain notifier RPC is still in the process | ||
| // of starting" wrapped in an Unknown gRPC status. Clients that interact with | ||
| // lnd immediately after it connects - such as Loop during integration testing - | ||
| // would previously treat that error as fatal and abort startup, even though | ||
| // retrying shortly after would succeed. | ||
| // | ||
| // This helper centralises the retry policy: when the specific "still starting" | ||
| // error is encountered we back off briefly and reissue the RPC. Non-startup | ||
| // errors are returned to the caller unchanged, and the caller's context | ||
| // controls the overall deadline so shutdown conditions are respected. | ||
| func (s *chainNotifierClient) retryChainNotifierCall(ctx context.Context, | ||
| call func() error) error { | ||
|
|
||
| for { | ||
| err := call() | ||
| if err == nil { | ||
| return nil | ||
| } | ||
|
|
||
| if !isChainNotifierStartingErr(err) { | ||
| return err | ||
| } | ||
|
|
||
| log.Warnf("Chain notifier RPC not ready yet, retrying: %v", err) | ||
|
|
||
| select { | ||
| case <-time.After(chainNotifierRetryBackoff): | ||
| continue | ||
|
|
||
| case <-ctx.Done(): | ||
| return ctx.Err() | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // detectChainNotifierStartupError reports whether err is due to the lnd | ||
| // ChainNotifier sub-server still starting up. Starting with lnd v0.20.0-rc3 | ||
| // the notifier is initialised later in the daemon lifecycle, and the RPC layer | ||
| // surfaces this as an Unknown gRPC status that contains the message defined in | ||
| // chainNotifierStartupMessage. There is a PR in LND to return code Unavailable | ||
| // instead of Unknown: https://github.com/lightningnetwork/lnd/pull/10352 | ||
| func isChainNotifierStartingErr(err error) bool { | ||
| if err == nil { | ||
| return false | ||
| } | ||
|
|
||
| // gRPC code Unavailable means "the server can't handle this request | ||
| // now, retry later". LND's chain notifier returns this error when | ||
| // the server is starting. | ||
| // See https://github.com/lightningnetwork/lnd/pull/10352 | ||
| st, ok := status.FromError(err) | ||
| if ok && st.Code() == codes.Unavailable { | ||
| return true | ||
| } | ||
|
|
||
| // TODO(ln-v0.20.0) remove the string fallback once lndclient depends on | ||
| // a version of lnd that returns codes.Unavailable for this condition. | ||
| return strings.Contains(err.Error(), chainNotifierStartupMessage) | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
observation: It seems the goal here is to check the availability of ChainRPC sub-server before registering spend notification and looks like that is achieved by attempting to registering spend notification and checking on that exact error message if no error returned form the register notification we assume the server started that is why it succeeded
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thought: What about asserting periodically on the
synced_to_chainfor wallet from LND GetInfo RPC call we know thatsynced_to_chainis would be true after the server is started (with almost no delay since we got the most recent height) as seen in the referenced code block down below. That way we don't need to check on raw exact strings which can be volatile also this gives a chance to perhaps remove a fair amount of code in the retry mechanism down belowhttps://github.com/lightningnetwork/lnd/blob/096ab65b1d5b1f5b79b4e3ea2659e904de0eeda2/lnd.go#L703-L745
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the idea! I found that LND returns gRPC code Unknown with that error. I sent a PR to LND lightningnetwork/lnd#10352 so it returns Unavailable, which suits better this condition (the server is not ready to serve the request yet, try again later).
On lndclient side I kept both checks for now. When LND PR is merged and that version is released, we can drop the string matching code, leaving only code comparison, which is much more reliable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good