Skip to content

services/horizon: Check state rebuild in state verification integration test #3127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Oct 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ commands:
command: |
cd ~/go/src/github.com/stellar/go
docker pull stellar/quickstart:testing
<<# parameters.enable-captive-core >>HORIZON_INTEGRATION_ENABLE_CAPTIVE_CORE=true<</ parameters.enable-captive-core >> go test -timeout 20m -v ./services/horizon/internal/integration/...
<<# parameters.enable-captive-core >>HORIZON_INTEGRATION_ENABLE_CAPTIVE_CORE=true<</ parameters.enable-captive-core >> go test -timeout 25m -v ./services/horizon/internal/integration/...

#-----------------------------------------------------------------------------#
# Jobs use the commands to accomplish a given task, and run through workflows #
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ import (
"github.com/stretchr/testify/assert"
)

const (
firstCheckpoint = (64 * (iota + 1)) - 1
secondCheckpoint
thirdCheckpoint
)

func TestProtocol14StateVerifier(t *testing.T) {
itest := integration.NewTest(t, protocol15Config)

Expand Down Expand Up @@ -99,36 +105,67 @@ func TestProtocol14StateVerifier(t *testing.T) {
assert.True(t, txResp.Successful)

// Reach the first checkpoint ledger
for !itest.LedgerIngested(63) {
err := itest.CloseCoreLedger()
assert.NoError(t, err)
time.Sleep(50 * time.Millisecond)
// Core will push to history archives *after* checkpoint ledger
itest.CloseCoreLedgersUntilSequence(firstCheckpoint + 1)
assert.NoError(t, err)
for !itest.LedgerIngested(firstCheckpoint) {
time.Sleep(time.Second)
}

verified := waitForStateVerifications(itest, 1)
if !verified {
t.Fatal("State verification not run...")
}

// Trigger state rebuild to check if ingesting from history archive works
itest.RunHorizonCLICommand("expingest", "trigger-state-rebuild")

// Wait for the second checkpoint ledger and state rebuild
// Core will push to history archives *after* checkpoint ledger
itest.CloseCoreLedgersUntilSequence(secondCheckpoint + 1)
assert.NoError(t, err)

// Wait for the third checkpoint ledger and state verification trigger
// Core will push to history archives *after* checkpoint ledger
itest.CloseCoreLedgersUntilSequence(thirdCheckpoint + 1)
assert.NoError(t, err)
for !itest.LedgerIngested(thirdCheckpoint) {
time.Sleep(time.Second)
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, okay so now that I understand the checkpoint thing a little more, why do we check for secondCheckpoint, then thirdCheckpoint, and only then do the state verification? It seems the check for the second is redundant, no? Since it has to wait for the third as the next step, and third implies second.

Copy link
Contributor Author

@bartekn bartekn Oct 26, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ingestion is running a state machine (fsm.go file). During normal Horizon operations with usually stay in resumeState all the time. However, horizon ingest trigger-state-rebuild command makes the machine jump to startState. There, we fall into the case that makes us wait for the next checkpoint. Why? It's because after the first checkpoint and state verification we're at ledger above 63. But the last checkpoint is still 63. So we wait for the network to close the second ledger to be able to catchup. This won't trigger state verification because it's triggered from resumeState only. That's why we need to wait for the third checkpoint.


var metrics string
verified = waitForStateVerifications(itest, 2)
if !verified {
t.Fatal("State verification not run...")
}
}

func waitForStateVerifications(itest *integration.Test, count int) bool {
t := itest.CurrentTest()
// Check metrics until state verification run
for i := 0; i < 60; i++ {
for i := 0; i < 120; i++ {
t.Logf("Checking metrics (%d attempt)\n", i)
res, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", itest.AdminPort()))
res, err := http.Get(itest.MetricsURL())
assert.NoError(t, err)

metricsBytes, err := ioutil.ReadAll(res.Body)
res.Body.Close()
assert.NoError(t, err)
metrics = string(metricsBytes)
metrics := string(metricsBytes)

stateInvalid := strings.Contains(metrics, "horizon_ingest_state_invalid 1")
assert.False(t, stateInvalid, "State is invalid!")

notVerifiedYet := strings.Contains(metrics, "horizon_ingest_state_verify_duration_seconds_count 0")
notVerifiedYet := strings.Contains(
metrics,
fmt.Sprintf("horizon_ingest_state_verify_duration_seconds_count %d", count-1),
)
if notVerifiedYet {
time.Sleep(time.Second)
continue
}

return
return true
}

t.Fatal("State verification not run...")
return false
}
70 changes: 67 additions & 3 deletions services/horizon/internal/test/integration/integration.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//lint:file-ignore U1001 Ignore all unused code, this is only used in tests.
package integration

import (
Expand Down Expand Up @@ -333,18 +334,33 @@ func (i *Test) Client() *sdk.Client {
// ingested by Horizon. Panics in case of errors.
func (i *Test) LedgerIngested(sequence uint32) bool {
root, err := i.Client().Root()
if err != nil {
panic(err)
}
panicIf(err)

return root.IngestSequence >= sequence
}

// LedgerClosed returns true if the ledger with a given sequence has been
// closed by Stellar-Core. Panics in case of errors. Note it's different
// than LedgerIngested because it checks if the ledger was closed, not
// necessarily ingested (ex. when rebuilding state Horizon does not ingest
// recent ledgers).
func (i *Test) LedgerClosed(sequence uint32) bool {
root, err := i.Client().Root()
panicIf(err)

return root.CoreSequence >= int32(sequence)
}

// AdminPort returns Horizon admin port.
func (i *Test) AdminPort() int {
return 6060
}

// Metrics URL returns Horizon metrics URL.
func (i *Test) MetricsURL() string {
return fmt.Sprintf("http://localhost:%d/metrics", i.AdminPort())
}

// Master returns a keypair of the network master account.
func (i *Test) Master() *keypair.Full {
return keypair.Master(NetworkPassphrase).(*keypair.Full)
Expand Down Expand Up @@ -638,9 +654,43 @@ func (i *Test) CreateSignedTransaction(
return tx, nil
}

// CloseCoreLedgersUntilSequence will close ledgers until sequence.
// Note: because manualclose command doesn't block until ledger is actually
// closed, after running this method the last sequence can be higher than seq.
func (i *Test) CloseCoreLedgersUntilSequence(seq int) error {
for {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
info, err := i.cclient.Info(ctx)
if err != nil {
return err
}

if info.Info.Ledger.Num >= seq {
return nil
}

i.t.Logf(
"Currently at ledger: %d, want: %d.",
info.Info.Ledger.Num,
seq,
)

err = i.CloseCoreLedger()
if err != nil {
return err
}
// manualclose command in core doesn't block until ledger is actually
// closed. Let's give it time to close the ledger.
time.Sleep(200 * time.Millisecond)
}
}

// CloseCoreLedgers will close one ledger.
func (i *Test) CloseCoreLedger() error {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
i.t.Log("Closing one ledger manually...")
return i.cclient.ManualClose(ctx)
}

Expand Down Expand Up @@ -686,6 +736,20 @@ func (i *Test) LogFailedTx(txResponse proto.Transaction, horizonResult error) {
"Transaction doesn't have success code.")
}

func (i *Test) RunHorizonCLICommand(cmd ...string) {
fullCmd := append([]string{"/stellar/horizon/bin/horizon"}, cmd...)
id, err := i.cli.ContainerExecCreate(
context.Background(),
i.container.ID,
types.ExecConfig{
Cmd: fullCmd,
},
)
panicIf(err)
err = i.cli.ContainerExecStart(context.Background(), id.ID, types.ExecStartCheck{})
panicIf(err)
}

// Cluttering code with if err != nil is absolute nonsense.
func panicIf(err error) {
if err != nil {
Expand Down