Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lima/roles/stop_dbs/tasks/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,7 @@
- name: Reload systemd
ansible.builtin.command: systemctl daemon-reload
changed_when: false

- name: Clear journalctl
ansible.builtin.command: journalctl --flush --rotate --vacuum-time=1s
changed_when: false
Comment thread
jason-lynch marked this conversation as resolved.
42 changes: 14 additions & 28 deletions server/internal/database/instance_resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ func (r *InstanceResource) Refresh(ctx context.Context, rc *resource.Context) er
}

func (r *InstanceResource) Create(ctx context.Context, rc *resource.Context) error {
if err := r.updateConnectionInfo(ctx, rc); err != nil {
return r.recordError(ctx, rc, err)
}
if err := WaitForPatroniRunning(ctx, r.patroniClient(), 0); err != nil {
err = fmt.Errorf("failed to wait for patroni to enter running state: %w", err)
return r.recordError(ctx, rc, err)
}
if err := r.initializeInstance(ctx, rc); err != nil {
return r.recordError(ctx, rc, err)
}
Expand All @@ -101,25 +108,17 @@ func (r *InstanceResource) Create(ctx context.Context, rc *resource.Context) err
}

func (r *InstanceResource) Update(ctx context.Context, rc *resource.Context) error {
// Get connection info from previous instance state in case the ports have
// changed.
previous, err := resource.FromContext[*InstanceResource](rc, r.Identifier())
if err != nil {
return r.recordError(ctx, rc, fmt.Errorf("failed to get previous instance state: %w", err))
if err := r.updateConnectionInfo(ctx, rc); err != nil {
return r.recordError(ctx, rc, err)
}
// We fallback to computing the connection info from the spec if the
// previous instance state is malformed.
if previous.ConnectionInfo != nil {
r.ConnectionInfo = previous.ConnectionInfo
} else if err := r.updateConnectionInfo(ctx, rc); err != nil {
patroniClient := r.patroniClient()
if err := WaitForPatroniRunning(ctx, patroniClient, 0); err != nil {
err = fmt.Errorf("failed to wait for patroni to enter running state: %w", err)
return r.recordError(ctx, rc, err)
}

if err := r.patroniClient().Reload(ctx); err != nil {
err = fmt.Errorf("failed to reload patroni conf: %w", err)
if err := r.restartIfNeeded(ctx, patroniClient); err != nil {
return r.recordError(ctx, rc, err)
}

if err := r.initializeInstance(ctx, rc); err != nil {
return r.recordError(ctx, rc, err)
}
Expand Down Expand Up @@ -164,20 +163,7 @@ func (r *InstanceResource) Connection(ctx context.Context, rc *resource.Context,
}

func (r *InstanceResource) initializeInstance(ctx context.Context, rc *resource.Context) error {
if err := r.updateConnectionInfo(ctx, rc); err != nil {
return err
}

patroniClient := r.patroniClient()

if err := WaitForPatroniRunning(ctx, patroniClient, 0); err != nil {
return fmt.Errorf("failed to wait for patroni to enter running state: %w", err)
}
if err := r.restartIfNeeded(ctx, patroniClient); err != nil {
return err
}

primaryInstanceID, err := GetPrimaryInstanceID(ctx, patroniClient, time.Minute)
primaryInstanceID, err := GetPrimaryInstanceID(ctx, r.patroniClient(), time.Minute)
if err != nil {
return err
}
Expand Down
7 changes: 7 additions & 0 deletions server/internal/database/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ import (
"github.com/pgEdge/control-plane/server/internal/utils"
)

// WaitForPatroniRunning polls the Patroni instance status endpoint until one of
// the following is true:
// - Patroni reports a running state
// - The context is canceled
// - The timeout has elapsed
// - We encounter more than 3 connection errors
// Giving a timeout of 0 will disable the timeout condition.
func WaitForPatroniRunning(ctx context.Context, patroniClient *patroni.Client, timeout time.Duration) error {
var cancel context.CancelFunc
if timeout > 0 {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ func (p *PatroniConfigGenerator) bootstrap(dcsParameters map[string]any) *patron
{Plugin: utils.PointerTo("spock_output")},
},
TTL: utils.PointerTo(30),
LoopWait: utils.PointerTo(10),
LoopWait: utils.PointerTo(int(patroni.DefaultLoopWaitSeconds)),
RetryTimeout: utils.PointerTo(10),
},
InitDB: utils.PointerTo([]string{"data-checksums"}),
Expand Down
30 changes: 28 additions & 2 deletions server/internal/orchestrator/swarm/patroni_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"maps"
"net/url"
"path/filepath"
"time"

"github.com/alessio/shellescape"
"github.com/samber/do"
Expand Down Expand Up @@ -178,7 +179,11 @@ func (c *PatroniConfig) Create(ctx context.Context, rc *resource.Context) error
}

func (c *PatroniConfig) Update(ctx context.Context, rc *resource.Context) error {
return c.Create(ctx, rc)
if err := c.Create(ctx, rc); err != nil {
return err
}

return c.signalReload(ctx, rc)
}

func (c *PatroniConfig) Delete(ctx context.Context, rc *resource.Context) error {
Expand Down Expand Up @@ -214,6 +219,27 @@ func (c *PatroniConfig) isNewNode(rc *resource.Context) (bool, error) {
}
}

func (c *PatroniConfig) signalReload(ctx context.Context, rc *resource.Context) error {
client, err := do.Invoke[*docker.Docker](rc.Injector)
if err != nil {
return err
}
// Signal the container if it exists
container, err := GetPostgresContainer(ctx, client, c.Spec.InstanceID)
if errors.Is(err, ErrNoPostgresContainer) {
return nil
} else if err != nil {
return fmt.Errorf("failed to check if postgres container exists: %w", err)
}
if err := client.ContainerSignal(ctx, container.ID, "SIGHUP"); err != nil {
return fmt.Errorf("failed to signal patroni to reload: %w", err)
}
// It can take up to loop_wait seconds for Patroni to reload the config.
// We'll want to update this code to read loop_wait from c.Spec if we make
// loop_wait configurable.
return utils.SleepContext(ctx, patroni.DefaultLoopWaitSeconds*time.Second)
}

func generatePatroniConfig(
spec *database.InstanceSpec,
instanceHostname string,
Expand Down Expand Up @@ -305,7 +331,7 @@ func generatePatroniConfig(
{Plugin: utils.PointerTo("spock_output")},
},
TTL: utils.PointerTo(30),
LoopWait: utils.PointerTo(10),
LoopWait: utils.PointerTo(int(patroni.DefaultLoopWaitSeconds)),
RetryTimeout: utils.PointerTo(10),
},
InitDB: utils.PointerTo([]string{"data-checksums"}),
Expand Down
32 changes: 27 additions & 5 deletions server/internal/orchestrator/systemd/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,24 +114,46 @@ func (c *Client) StopUnit(ctx context.Context, name string, wait bool) error {
return nil
}

func (c *Client) RestartUnit(ctx context.Context, name string) error {
func (c *Client) ReloadUnit(ctx context.Context, name string) error {
logger := c.logger.With().Str("unit", name).Logger()
logger.Debug().Msg("restarting unit")
logger.Debug().Msg("reloading unit")

resCh := make(chan string, 1)
pid, err := c.conn.ReloadUnitContext(ctx, name, "replace", resCh)
if err != nil {
return fmt.Errorf("failed to reload unit '%s': %w", name, err)
}
res, err := awaitJob(ctx, resCh)
if err != nil {
return fmt.Errorf("failed to reload unit '%s': %w", name, err)
}

c.logger.Debug().
Str("response", res).
Int("pid", pid).
Msg("reloaded unit")

return nil
}

func (c *Client) ReloadOrRestartUnit(ctx context.Context, name string) error {
logger := c.logger.With().Str("unit", name).Logger()
logger.Debug().Msg("reloading or restarting unit")

resCh := make(chan string, 1)
pid, err := c.conn.ReloadOrRestartUnitContext(ctx, name, "replace", resCh)
if err != nil {
return fmt.Errorf("failed to restart unit '%s': %w", name, err)
return fmt.Errorf("failed to reload or restart unit '%s': %w", name, err)
}
res, err := awaitJob(ctx, resCh)
if err != nil {
return fmt.Errorf("failed to restart unit '%s': %w", name, err)
return fmt.Errorf("failed to reload or restart unit '%s': %w", name, err)
}

c.logger.Debug().
Str("response", res).
Int("pid", pid).
Msg("restarted unit")
Msg("reloaded or restarted unit")

return nil
}
Expand Down
32 changes: 31 additions & 1 deletion server/internal/orchestrator/systemd/patroni_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@ package systemd

import (
"context"
"errors"
"fmt"
"strings"
"time"

"github.com/pgEdge/control-plane/server/internal/ds"
"github.com/pgEdge/control-plane/server/internal/host"
"github.com/pgEdge/control-plane/server/internal/orchestrator/common"
"github.com/pgEdge/control-plane/server/internal/patroni"
"github.com/pgEdge/control-plane/server/internal/resource"
"github.com/pgEdge/control-plane/server/internal/utils"
"github.com/samber/do"
)

Expand Down Expand Up @@ -79,9 +83,35 @@ func (c *PatroniConfig) Create(ctx context.Context, rc *resource.Context) error
}

func (c *PatroniConfig) Update(ctx context.Context, rc *resource.Context) error {
return c.Create(ctx, rc)
if err := c.Create(ctx, rc); err != nil {
return err
}

return c.signalReload(ctx, rc)
}

func (c *PatroniConfig) Delete(ctx context.Context, rc *resource.Context) error {
return c.Base.Delete(ctx, rc)
}

func (c *PatroniConfig) signalReload(ctx context.Context, rc *resource.Context) error {
client, err := do.Invoke[*Client](rc.Injector)
if err != nil {
return err
}
// Reload patroni unit if it exists
name := patroniServiceName(c.Base.InstanceID)
err = client.UnitExists(ctx, name)
if errors.Is(err, ErrUnitNotFound) {
return nil
} else if err != nil {
return fmt.Errorf("failed to check if patroni unit exists: %w", err)
}
if err := client.ReloadUnit(ctx, name); err != nil {
return fmt.Errorf("failed to reload patroni: %w", err)
}
// It can take up to loop_wait seconds for Patroni to reload the config.
// We'll want to update this code to read loop_wait from c.Base if we make
// loop_wait configurable.
return utils.SleepContext(ctx, patroni.DefaultLoopWaitSeconds*time.Second)
}
4 changes: 2 additions & 2 deletions server/internal/orchestrator/systemd/unit.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ func (r *UnitResource) Create(ctx context.Context, rc *resource.Context) error {
if err := client.EnableUnit(ctx, r.Name); err != nil {
return fmt.Errorf("failed to enable unit '%s': %w", path, err)
}
if err := client.RestartUnit(ctx, r.Name); err != nil {
return fmt.Errorf("failed to restart unit '%s': %w", path, err)
if err := client.ReloadOrRestartUnit(ctx, r.Name); err != nil {
return fmt.Errorf("failed to reload or restart unit '%s': %w", path, err)
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

return nil
Expand Down
2 changes: 2 additions & 0 deletions server/internal/patroni/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"github.com/pgEdge/control-plane/server/internal/storage"
)

const DefaultLoopWaitSeconds = 10

func Namespace() string {
return storage.Prefix("/", "patroni")
}
Expand Down
9 changes: 9 additions & 0 deletions server/internal/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ func Retry(maxAttempts int, initialDelay time.Duration, f func() error) error {
return nil
}

func SleepContext(ctx context.Context, duration time.Duration) error {
select {
case <-time.After(duration):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks it will sleep even if there is a cancellation.
Do you think we can change like below?

func SleepContext(ctx context.Context, duration time.Duration) error {
	t := time.NewTimer(duration)
	defer t.Stop()
	select {
	case <-t.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the issue you're referring to was fixed in Go 1.23. The current docs for time.After say:

Before Go 1.23, this documentation warned that the underlying Timer would not be recovered by the garbage collector until the timer fired, and that if efficiency was a concern, code should use NewTimer instead and call Timer.Stop if the timer is no longer needed. As of Go 1.23, the garbage collector can recover unreferenced, unstopped timers. There is no reason to prefer NewTimer when After will do.

https://pkg.go.dev/time@master#After

return nil
case <-ctx.Done():
return ctx.Err()
}
}

func PointerTo[T any](v T) *T {
return &v
}
Expand Down