Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions docs/using/etcd-reconfiguration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Etcd Mode Reconfiguration


This guide explains how to change a Control Plane host's etcd mode after cluster initialization.


## Overview


The Control Plane supports two etcd modes:


- **Server mode**: Runs an embedded etcd server and participates as a voting member
- **Client mode**: Connects to the etcd cluster as a client only


**Recommended topology:**
- 1-3 hosts: All should be etcd servers
- 4-7 hosts: 3 etcd servers, rest as clients
- 8+ hosts: 5 etcd servers, rest as clients


!!! warning "Maintain Odd Numbers"
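Etcd should run an **odd number** of servers (3 or 5): quorum is a majority of members, so an even-sized cluster tolerates no more failures than the next smaller odd size.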


## How It Works


Etcd mode reconfiguration is **fully automatic**:


1. Stop the container
2. Update `PGEDGE_ETCD_MODE` environment variable
3. Restart the container
4. The system automatically handles all cluster operations


**What happens automatically:**
- **Client→Server**: Discovers cluster, obtains credentials, joins as voting member
- **Server→Client**: Removes itself from membership, transitions to client mode


No manual API calls or configuration needed!


## Procedures


### Promoting a Client to Server (Example - host-4)


```bash
# 1. Stop the container
docker stop control-plane-host-4


# 2. Update docker-compose.yaml environment:
PGEDGE_ETCD_MODE: server # was: client


# 3. Restart
docker-compose up -d host-4


# 4. Verify (check logs)
docker logs control-plane-host-4
```


### Demoting a Server to Client (Example - host-4)


!!! warning "Quorum Check"
Ensure at least 2 other healthy servers remain before demotion.


```bash
# 1. Stop the container
docker stop control-plane-host-4


# 2. Update docker-compose.yaml environment:
PGEDGE_ETCD_MODE: client # was: server


# 3. Restart
docker-compose up -d host-4


# 4. Verify (check logs)
docker logs control-plane-host-4
```


## Troubleshooting


### Promotion Issues


**Problem**: Host fails to join cluster
**Solution**: Check logs for connection errors. Verify network connectivity and that other hosts are healthy.


**Problem**: "Permission denied" errors
**Solution**: The system automatically obtains new credentials. If the issue persists, check that RBAC is enabled on the cluster.


### Demotion Issues


**Problem**: Host fails to remove itself from membership
**Solution**: Check that the remaining servers have quorum. The system continues the transition to client mode even if the removal fails.


**Problem**: Old data directory persists
**Solution**: The system automatically cleans up the etcd data directory. If the directory persists, remove it manually after verifying that the host has transitioned to client mode.


### General Troubleshooting


Check cluster health:


```bash
docker exec control-plane-host-1 etcdctl member list
```


All members should show `STATUS=started`.


## Best Practices


- **Change one host at a time** - Wait for completion before reconfiguring another
- **Monitor cluster health** - Verify all servers healthy before/after changes
- **Maintain odd numbers** - Always keep 3 or 5 etcd servers, never 2 or 4

## Summary


Etcd mode reconfiguration is fully automatic - just update the environment variable and restart. The Control Plane handles all cluster operations including credential provisioning, membership changes, and configuration updates without manual intervention.

54 changes: 8 additions & 46 deletions server/internal/api/apiv1/pre_init_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,14 @@ import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/base64"
"fmt"
"net/http"
"net/url"
"os"

"github.com/google/uuid"
goahttp "goa.design/goa/v3/http"

api "github.com/pgEdge/control-plane/api/apiv1/gen/control_plane"
"github.com/pgEdge/control-plane/api/apiv1/gen/http/control_plane/client"
"github.com/pgEdge/control-plane/server/internal/cluster"
"github.com/pgEdge/control-plane/server/internal/config"
"github.com/pgEdge/control-plane/server/internal/etcd"
Expand Down Expand Up @@ -93,20 +90,15 @@ func (s *PreInitHandlers) JoinCluster(ctx context.Context, token *api.ClusterJoi
return ErrInvalidServerURL
}

http_client, err := s.GetClient()

httpClient, err := s.GetClient()
if err != nil {
return err
}

enc := goahttp.RequestEncoder
dec := goahttp.ResponseDecoder //make our own
c := client.NewClient(serverURL.Scheme, serverURL.Host, http_client, enc, dec, false)
cli := &api.Client{
GetJoinOptionsEndpoint: c.GetJoinOptions(),
}
// Use shared API client creation utility
apiClient := etcd.CreateAPIClient(serverURL, httpClient)

opts, err := cli.GetJoinOptions(ctx, &api.ClusterJoinRequest{
opts, err := apiClient.GetJoinOptions(ctx, &api.ClusterJoinRequest{
HostID: api.Identifier(s.cfg.HostID),
Hostname: s.cfg.Hostname,
Ipv4Address: s.cfg.IPv4Address,
Expand All @@ -117,43 +109,13 @@ func (s *PreInitHandlers) JoinCluster(ctx context.Context, token *api.ClusterJoi
return apiErr(err)
}

caCert, err := base64.StdEncoding.DecodeString(opts.Credentials.CaCert)
if err != nil {
return apiErr(fmt.Errorf("failed to decode CA certificate: %w", err))
}
clientCert, err := base64.StdEncoding.DecodeString(opts.Credentials.ClientCert)
if err != nil {
return apiErr(fmt.Errorf("failed to decode client certificate: %w", err))
}
clientKey, err := base64.StdEncoding.DecodeString(opts.Credentials.ClientKey)
if err != nil {
return apiErr(fmt.Errorf("failed to decode client key: %w", err))
}
serverCert, err := base64.StdEncoding.DecodeString(opts.Credentials.ServerCert)
if err != nil {
return apiErr(fmt.Errorf("failed to decode server certificate: %w", err))
}
serverKey, err := base64.StdEncoding.DecodeString(opts.Credentials.ServerKey)
// Decode credentials using shared utility
joinOptions, err := etcd.DecodeJoinCredentials(opts)
if err != nil {
return apiErr(fmt.Errorf("failed to decode server key: %w", err))
return apiErr(err)
}

err = s.etcd.Join(ctx, etcd.JoinOptions{
Leader: &etcd.ClusterMember{
Name: opts.Leader.Name,
PeerURLs: opts.Leader.PeerUrls,
ClientURLs: opts.Leader.ClientUrls,
},
Credentials: &etcd.HostCredentials{
Username: opts.Credentials.Username,
Password: opts.Credentials.Password,
CaCert: caCert,
ClientCert: clientCert,
ClientKey: clientKey,
ServerCert: serverCert,
ServerKey: serverKey,
},
})
err = s.etcd.Join(ctx, *joinOptions)
if err != nil {
return apiErr(fmt.Errorf("failed to join existing cluster: %w", err))
}
Expand Down
5 changes: 5 additions & 0 deletions server/internal/etcd/embedded.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ func (e *EmbeddedEtcd) Start(ctx context.Context) error {
e.mu.Lock()
defer e.mu.Unlock()

if e.etcd != nil {
return nil // already started
}

initialized, err := e.IsInitialized()
if err != nil {
return err
Expand Down Expand Up @@ -292,6 +296,7 @@ func (e *EmbeddedEtcd) Shutdown() error {
}
if e.etcd != nil {
e.etcd.Close()
e.etcd = nil
}
return errors.Join(errs...)
}
Expand Down
60 changes: 54 additions & 6 deletions server/internal/etcd/provide.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package etcd

import (
"context"
"fmt"
"time"

"github.com/rs/zerolog"
"github.com/samber/do"
Expand All @@ -27,6 +29,18 @@ func provideClient(i *do.Injector) {
})
}

// newEtcdForMode selects the Etcd implementation for the given mode.
//
// config.EtcdModeServer yields the value returned by NewEmbeddedEtcd and
// config.EtcdModeClient yields the value returned by NewRemoteEtcd; both are
// constructed from cfg and logger. Any other mode produces a nil Etcd and an
// "invalid etcd mode" error.
func newEtcdForMode(mode config.EtcdMode, cfg *config.Manager, logger zerolog.Logger) (Etcd, error) {
	if mode == config.EtcdModeServer {
		return NewEmbeddedEtcd(cfg, logger), nil
	}
	if mode == config.EtcdModeClient {
		return NewRemoteEtcd(cfg, logger), nil
	}
	// Neither known mode matched: report the offending value to the caller.
	return nil, fmt.Errorf("invalid etcd mode: %s", mode)
}

func provideEtcd(i *do.Injector) {
do.Provide(i, func(i *do.Injector) (Etcd, error) {
cfg, err := do.Invoke[*config.Manager](i)
Expand All @@ -38,13 +52,47 @@ func provideEtcd(i *do.Injector) {
return nil, err
}

switch storageType := cfg.Config().EtcdMode; storageType {
case config.EtcdModeServer:
return NewEmbeddedEtcd(cfg, logger), nil
case config.EtcdModeClient:
return NewRemoteEtcd(cfg, logger), nil
appCfg := cfg.Config()
generated := cfg.GeneratedConfig()

oldMode := generated.EtcdMode
newMode := appCfg.EtcdMode

logger.Info().
Str("old_mode", string(oldMode)).
Str("new_mode", string(newMode)).
Bool("old_mode_empty", oldMode == "").
Bool("modes_equal", oldMode == newMode).
Msg("checking etcd mode for reconfiguration")

// First startup (no generated config yet) or no change: use the configured mode.
if oldMode == "" || oldMode == newMode {
logger.Info().
Str("mode", string(newMode)).
Bool("first_startup", oldMode == "").
Msg("creating new etcd instance for mode (no reconfiguration needed)")
return newEtcdForMode(newMode, cfg, logger)
}

// Mode has changed - perform reconfiguration.
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()

logger.Info().
Str("host_id", appCfg.HostID).
Str("old_mode", string(oldMode)).
Str("new_mode", string(newMode)).
Msg("detected etcd_mode change, performing reconfiguration")

switch {
case oldMode == config.EtcdModeServer && newMode == config.EtcdModeClient:
return reconfigureServerToClient(ctx, cfg, logger)

case oldMode == config.EtcdModeClient && newMode == config.EtcdModeServer:
return reconfigureClientToServer(ctx, cfg, logger)

default:
return nil, fmt.Errorf("invalid storage type: %s", storageType)
return nil, fmt.Errorf("unsupported etcd mode transition: %s -> %s", oldMode, newMode)
}
})
}
Expand Down
2 changes: 2 additions & 0 deletions server/internal/etcd/rbac.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,8 @@ func writeHostCredentials(creds *HostCredentials, cfg *config.Manager) error {
generatedCfg := cfg.GeneratedConfig()
generatedCfg.EtcdUsername = creds.Username
generatedCfg.EtcdPassword = creds.Password
generatedCfg.EtcdMode = appCfg.EtcdMode

if err := cfg.UpdateGeneratedConfig(generatedCfg); err != nil {
return fmt.Errorf("failed to update generated config: %w", err)
}
Expand Down
Loading