DB-HA架構

DB HA

Step by Step PostgreSQL High Availability Setup using Patroni & Etcd & HAProxy for Debian (Ubuntu)

以下
192.168.1.65 -ha ip
192.168.1.67 -db1 ip
192.168.1.69 -db2 ip

一開始有廢話的

sudo vi /etc/hosts

192.168.1.67 pg-patroni01
192.168.1.69 pg-patroni01
192.168.1.65 pg-proxy

work機

node-bash

sudo hostnamectl set-hostname pg-patroni02
sudo vi /etc/hosts
sudo sh -c \'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list\'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt -y install postgresql-15 postgresql-server-dev-15
sudo ln -s /usr/lib/postgresql/15/bin/* /usr/sbin/
sudo -u postgres psql   # 改密碼
systemctl stop postgresql.service
systemctl status postgresql.service
sudo apt -y install python3 python3-pip
sudo apt install python3-pip python3-dev libpq-dev -y
sudo -H pip3 install psycopg2 --break-system-packages
sudo -H pip3 install patroni --break-system-packages
sudo -H pip3 install python-etcd --break-system-packages
sudo mkdir -p /data/patroni
sudo chown postgres:postgres /data/patroni/
sudo chmod 700 /data/patroni/
sudo vi /etc/patroni.yml
sudo vi /etc/systemd/system/patroni.service
sudo systemctl daemon-reload
sudo systemctl enable patroni
sudo systemctl enable postgresql
sudo systemctl start patroni
sudo systemctl start postgresql
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/systemd/system/patroni.service
sudo vi /etc/patroni.yml
sudo systemctl daemon-reload
sudo systemctl enable patroni
sudo systemctl enable postgresql
sudo systemctl start patroni
sudo systemctl start postgresql
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/patroni.yml
sudo ss -ltnp | grep 5432
journalctl -u patroni -f
sudo vi /etc/systemd/system/etcd.service
sudo systemctl daemon-reload
sudo systemctl enable etcd
sudo systemctl start etcd
curl http://192.168.1.69:2379/v2/keys/
sudo patronictl -c /etc/patroni.yml list
sudo systemctl restart patroni
curl http://192.168.1.67:8008/readiness
sudo journalctl -u patroni -f
psql -h 192.168.1.65 -p 5000 -U postgres -W
sudo patronictl -c /etc/patroni.yml list

sudo vi /etc/patroni.yml

“` yml=
scope: postgres
namespace: /db/
name: patronidb2

restapi:
listen: 192.168.1.69:8008
connect_address: 192.168.1.69:8008

etcd3:
host: 192.168.1.65:2379

bootstrap:
dcs:
ttl: 30
loop_wait: 10
retry_timeout: 10
maximum_lag_on_failover: 1048576
postgresql:
use_pg_rewind: true
initdb:
– encoding: UTF8
– data-checksums
pg_hba:
– host replication replicator 127.0.0.1/32 md5
– host replication replicator 192.168.1.69/0 md5
– host replication replicator 192.168.1.67/0 md5
– host all all 0.0.0.0/0 md5
users:
admin:
password: admin
options:
– createrole
– createdb
postgresql:
listen: 192.168.1.69:5432
connect_address: 192.168.1.69:5432
data_dir: /data/patroni
pgpass: /tmp/pgpass
authentication:
replication:
username: replicator
password: replicator2024
superuser:
username: postgres
password: postgres2024
parameters:
unix_socket_directories: \’.\’
tags:
nofailover: false
noloadbalance: false
clonefrom: false
nosync: false

#### sudo vi /etc/systemd/system/patroni.service

``` yml=
[Unit]
Description=Runners to orchestrate a high-availability PostgreSQL
After=syslog.target network.target

[Service]
Type=simple

User=postgres
Group=postgres

ExecStart=/usr/local/bin/patroni /etc/patroni.yml
KillMode=process
TimeoutSec=30
Restart=no

[Install]
WantedBy=multi-user.target

sudo vi /etc/systemd/system/etcd.service

“`yaml=
[Unit]
Description=etcd key-value store
After=network.target

[Service]
Type=notify
ExecStart=/usr/local/bin/etcd \
–name node1 \
–data-dir /var/lib/etcd \
–listen-peer-urls http://192.168.1.69:2380 \
–listen-client-urls http://192.168.1.69:2379,http://127.0.0.1:2379 \
–advertise-client-urls http://192.168.1.69:2379 \
–initial-advertise-peer-urls http://192.168.1.69:2380 \
–initial-cluster node1=http://192.168.1.69:2380 \
–initial-cluster-token etcd-cluster \
–initial-cluster-state new \
–enable-v2=true
Restart=always
RestartSec=5
LimitNOFILE=40000

[Install]
WantedBy=multi-user.target

#### 改帳密
```sql=
# psql
postgres=# ALTER USER postgres PASSWORD \'postgres2024\';
postgres=# CREATE USER replicator WITH ENCRYPTED PASSWORD \'replicator2024\';

ha

bash

sudo hostnamectl set-hostname pg-proxy
sudo vi /etc/hosts
sudo apt-get update
sudo apt update && sudo apt upgrade -y
sudo apt install -y etcd-server haproxy
sudo vi /etc/default/etcd
sudo systemctl restart etcd
sudo systemctl status etcd
sudo vi /etc/haproxy/haproxy.cfg
sudo systemctl start haproxy
sudo systemctl enable haproxy
sudo systemctl restart haproxy
sudo systemctl status haproxy
sudo haproxy -c -V -f /etc/haproxy/haproxy.cfg
sudo netstat -tulnp | grep 5000
sudo ufw allow 5000/tcp
curl http://192.168.1.67:8008/health
sudo journalctl -u haproxy -f
curl http://192.168.1.65:2379/v2/keys/
sudo systemctl restart haproxy
sudo systemctl status haproxy
curl -v http://192.168.1.67:8008/readiness
sudo journalctl -u haproxy -f
sudo systemctl status haproxy
history

sudo vi /etc/default/etcd

ETCD_LISTEN_PEER_URLS="http://192.168.1.65:2380,http://127.0.0.1:7001"
ETCD_LISTEN_CLIENT_URLS="http://127.0.0.1:2379, http://192.168.1.65:2379"
ETCD_INITIAL_ADVERTISE_PEER_URLS="http://192.168.1.65:2380"
ETCD_INITIAL_CLUSTER="etcd0=http://192.168.1.65:2380,"
ETCD_ADVERTISE_CLIENT_URLS="http://192.168.1.65:2379"
ETCD_INITIAL_CLUSTER_TOKEN="cluster1"
ETCD_INITIAL_CLUSTER_STATE="new"
ETCD_ENABLE_V2="true"

sudo vi /etc/haproxy/haproxy.cfg

global
    maxconn     4000

defaults
    log global
    mode tcp
    retries 2
    timeout connect 4s
    timeout client 30m
    timeout server 30m
    timeout check 5s


listen stats
    mode http
    bind 192.168.1.65:7000
    stats enable
    stats uri /

listen postgres
    bind *:5000
    option httpchk
    http-check expect status 200
    default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
    server node1 192.168.1.67:5432 maxconn 1000 check port 8008
    server node2 192.168.1.69:5432 maxconn 1000 check port 8008
  ```

![image](https://hackmd.io/_uploads/rkoexWiZle.png)
![image](https://hackmd.io/_uploads/H1b-xZj-le.png)
![image](https://hackmd.io/_uploads/H1xt-lWobge.png)
![image](https://hackmd.io/_uploads/BJJfx-sWgg.png)

## 去除廢話版本
### work機
#### bash
```bash=
sudo sh -c \'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list\'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt -y install postgresql-15 postgresql-server-dev-15
sudo ln -s /usr/lib/postgresql/15/bin/* /usr/sbin/
sudo -u postgres psql
systemctl stop postgresql.service
systemctl status postgresql.service
sudo apt -y install python3 python3-pip
sudo apt install python3-pip python3-dev libpq-dev -y
sudo -H pip3 install psycopg2 --break-system-packages
sudo -H pip3 install patroni --break-system-packages
sudo -H pip3 install python-etcd --break-system-packages
sudo mkdir -p /data/patroni
sudo chown postgres:postgres /data/patroni/
sudo chmod 700 /data/patroni/
sudo vi /etc/patroni.yml
sudo vi /etc/systemd/system/patroni.service
sudo systemctl daemon-reload
sudo systemctl enable patroni
sudo systemctl enable postgresql
sudo systemctl start patroni
sudo systemctl start postgresql
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/systemd/system/etcd.service
sudo patronictl -c /etc/patroni.yml list

sudo -u postgres psql

ALTER USER postgres PASSWORD \'postgres2024\';
CREATE USER replicator WITH ENCRYPTED PASSWORD \'replicator2024\';

sudo vi /etc/patroni.yml

scope: postgres
namespace: /db/
name: patronidb2

restapi:
    listen: 192.168.1.74:8008
    connect_address: 192.168.1.74:8008

etcd3:
    host: 192.168.1.73:2379

bootstrap:
    dcs:
        ttl: 30
        loop_wait: 10
        retry_timeout: 10
        maximum_lag_on_failover: 1048576
        postgresql:
            use_pg_rewind: true
    initdb:
    - encoding: UTF8
    - data-checksums
    pg_hba:
    - host replication replicator 127.0.0.1/32 md5
    - host replication replicator 192.168.1.74/0 md5
    - host replication replicator 192.168.1.57/0 md5
    - host all all 0.0.0.0/0 md5
users:
        admin:
            password: admin
            options:
                - createrole
                - createdb
postgresql:
    listen: 192.168.1.74:5432
    connect_address: 192.168.1.74:5432
    data_dir: /data/patroni
    pgpass: /tmp/pgpass
    authentication:
        replication:
            username: replicator
            password: replicator2024
        superuser:
            username: postgres
            password: postgres2024
    parameters:
        unix_socket_directories: \'.\'
tags:
    nofailover: false
    noloadbalance: false
    clonefrom: false
    nosync: false

sudo vi /etc/systemd/system/patroni.service

[Unit]
Description=Runners to orchestrate a high-availability PostgreSQL
After=syslog.target network.target

[Service]
Type=simple

User=postgres
Group=postgres

ExecStart=/usr/local/bin/patroni /etc/patroni.yml
KillMode=process
TimeoutSec=30
Restart=no

[Install]
WantedBy=multi-user.target

sudo vi /etc/systemd/system/etcd.service

[Unit]
Description=etcd key-value store
After=network.target

[Service]
Type=notify
ExecStart=/usr/local/bin/etcd \\
  --name node1 \\
  --data-dir /var/lib/etcd \\
  --listen-peer-urls http://192.168.1.74:2380 \\
  --listen-client-urls http://192.168.1.74:2379,http://127.0.0.1:2379 \\
  --advertise-client-urls http://192.168.1.74:2379 \\
  --initial-advertise-peer-urls http://192.168.1.74:2380 \\
  --initial-cluster node1=http://192.168.1.74:2380 \\
  --initial-cluster-token etcd-cluster \\
  --initial-cluster-state new \\
  --enable-v2=true
Restart=always
RestartSec=5
LimitNOFILE=40000

[Install]
WantedBy=multi-user.target

ha機

bash
“`bash=
sudo apt-get update
sudo apt update && sudo apt upgrade -y
sudo apt install -y etcd-server haproxy
sudo vi /etc/default/etcd
sudo vi /etc/haproxy/haproxy.cfg
sudo systemctl restart etcd
sudo systemctl status etcd
sudo systemctl start haproxy
sudo systemctl enable haproxy
sudo systemctl restart haproxy
sudo systemctl status haproxy
sudo haproxy -c -V -f /etc/haproxy/haproxy.cfg
sudo netstat -tulnp | grep 5000
sudo ufw allow 5000/tcp
sudo systemctl status haproxy


sudo vi /etc/default/etcd

ETCD_LISTEN_PEER_URLS=”http://192.168.1.73:2380,http://127.0.0.1:7001″
ETCD_LISTEN_CLIENT_URLS=”http://127.0.0.1:2379, http://192.168.1.73:2379″
ETCD_INITIAL_ADVERTISE_PEER_URLS=”http://192.168.1.73:2380″
ETCD_INITIAL_CLUSTER=”etcd0=http://192.168.1.73:2380,”
ETCD_ADVERTISE_CLIENT_URLS=”http://192.168.1.73:2379″
ETCD_INITIAL_CLUSTER_TOKEN=”cluster1″
ETCD_INITIAL_CLUSTER_STATE=”new”
ETCD_ENABLE_V2=”true”


sudo vi /etc/haproxy/haproxy.cfg

global
maxconn 4000

defaults
log global
mode tcp
retries 2
timeout connect 4s
timeout client 30m
timeout server 30m
timeout check 5s

listen stats
mode http
bind 192.168.1.73:7000
stats enable
stats uri /

listen postgres
bind *:5000
option httpchk
http-check expect status 200
default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server node1 192.168.1.57:5432 maxconn 1000 check port 8008
server node2 192.168.1.74:5432 maxconn 1000 check port 8008


![image](https://hackmd.io/_uploads/H1rPyfi-ex.png)
> **INFO:** ## Patroni 同步複製完整實施方案

### 1. Patroni 集群架構

```yaml
# patroni.yml 關鍵配置
postgresql:
  parameters:
    synchronous_commit: \'on\'
    synchronous_standby_names: \'ANY 1 (replica1, replica2)\'
    max_wal_senders: 10
    wal_level: replica
    hot_standby: on

bootstrap:
  dcs:
    ttl: 30
    loop_wait: 10
    retry_timeout: 30
    maximum_lag_on_failover: 1048576
    synchronous_mode: true
    synchronous_mode_strict: true

節點配置:
– Primary: 主寫入節點
– Replica1: 同步備份(同機房)
– Replica2: 同步備份(異機房)
– etcd 集群: 3 節點分散式配置存儲

2. 監控系統架構

核心監控組件

Prometheus + Grafana 棧:

# docker-compose.yml
services:
  prometheus:
    image: prom/prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=secure_password

  postgres_exporter:
    image: prometheuscommunity/postgres-exporter
    environment:
      - DATA_SOURCE_NAME=postgresql://monitor:password@postgres:5432/postgres?sslmode=disable

關鍵監控指標:
pg_up: 數據庫可用性
pg_replication_lag: 複製延遲
pg_stat_replication: 複製狀態
patroni_master: 主節點狀態
patroni_replica_lag_in_seconds: Patroni 複製延遲

預警規則配置

# prometheus rules
groups:
- name: postgresql
  rules:
  - alert: PostgreSQLDown
    expr: pg_up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL 實例宕機"

  - alert: ReplicationLag
    expr: pg_replication_lag > 100
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "複製延遲超過 100MB"

  - alert: PatroniMasterMissing
    expr: absent(patroni_master == 1)
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Patroni 主節點缺失"

3. 日誌監控

ELK Stack 配置:

# logstash pipeline
input {
  file {
    path => "/var/log/postgresql/*.log"
    type => "postgresql"
  }
  file {
    path => "/var/log/patroni/*.log"
    type => "patroni"
  }
}

filter {
  if [type] == "postgresql" {
    grok {
      match => { "message" => "%{TIMESTAMP_ISO8601:timestamp} \\[%{INT:pid}\\] %{WORD:level}: %{GREEDYDATA:msg}" }
    }
  }
}

4. 自動化腳本

健康檢查腳本

!/bin/bash

PostgreSQL HA 健康檢查腳本

檢查 Patroni 集群狀態和複製健康度

set -euo pipefail

配置變數

PATRONI_HOST=”localhost”
PATRONI_PORT=”8008″
PG_HOST=”localhost”
PG_PORT=”5432″
PG_USER=”postgres”
ALERT_THRESHOLD_LAG=100 # MB
WEBHOOK_URL=”https://hooks.slack.com/your-webhook”

日誌函數

log() {
echo “[$(date \’+%Y-%m-%d %H:%M:%S\’)] $1” | tee -a /var/log/pg-health-check.log
}

檢查 Patroni 集群狀態

check_patroni_cluster() {
log “檢查 Patroni 集群狀態…”

local status
status=$(curl -s "http://${PATRONI_HOST}:${PATRONI_PORT}/cluster" || echo "ERROR")

if [[ "$status" == "ERROR" ]]; then
    log "ERROR: 無法連接到 Patroni API"
    send_alert "Patroni API 不可達" "critical"
    return 1
fi

# 解析集群狀態
local leader_count
leader_count=$(echo "$status" | jq \'.members | map(select(.role == "Leader")) | length\')

if [[ "$leader_count" -ne 1 ]]; then
    log "ERROR: 集群中有 $leader_count 個 Leader"
    send_alert "Patroni 集群狀態異常: ${leader_count} 個 Leader" "critical"
    return 1
fi

log "Patroni 集群狀態正常"
return 0

}

檢查複製延遲

check_replication_lag() {
log “檢查複製延遲…”

local query="SELECT client_addr, state, pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn)) as lag FROM pg_stat_replication;"

local result
result=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -t -c "$query" 2>/dev/null || echo "ERROR")

if [[ "$result" == "ERROR" ]]; then
    log "ERROR: 無法查詢複製狀態"
    send_alert "無法查詢 PostgreSQL 複製狀態" "critical"
    return 1
fi

# 檢查每個複製節點的延遲
while IFS=\'|\' read -r client_addr state lag; do
    client_addr=$(echo "$client_addr" | xargs)
    state=$(echo "$state" | xargs)
    lag=$(echo "$lag" | xargs)

    if [[ "$state" != "streaming" ]]; then
        log "WARNING: 複製節點 $client_addr 狀態異常: $state"
        send_alert "複製節點 $client_addr 狀態: $state" "warning"
    fi

    # 轉換延遲為 MB
    local lag_mb
    if [[ "$lag" =~ ([0-9]+)\\ MB ]]; then
        lag_mb=${BASH_REMATCH[1]}
        if [[ "$lag_mb" -gt "$ALERT_THRESHOLD_LAG" ]]; then
            log "WARNING: 複製節點 $client_addr 延遲過大: ${lag_mb}MB"
            send_alert "複製延遲告警: $client_addr 延遲 ${lag_mb}MB" "warning"
        fi
    fi

    log "複製節點 $client_addr: $state, 延遲: $lag"
done <<< "$result"

return 0

}

檢查數據庫連接

check_db_connection() {
log “檢查數據庫連接…”

local result
result=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -c "SELECT 1;" -t 2>/dev/null || echo "ERROR")

if [[ "$result" == "ERROR" ]]; then
    log "ERROR: 無法連接到 PostgreSQL"
    send_alert "PostgreSQL 數據庫連接失敗" "critical"
    return 1
fi

log "數據庫連接正常"
return 0

}

檢查磁盤空間

check_disk_space() {
log “檢查磁盤空間…”

local data_dir
data_dir=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -t -c "SHOW data_directory;" | xargs)

local usage
usage=$(df "$data_dir" | awk \'NR==2 {print $5}\' | sed \'s/%//\')

if [[ "$usage" -gt 85 ]]; then
    log "WARNING: 數據目錄磁盤使用率: ${usage}%"
    send_alert "磁盤空間告警: 使用率 ${usage}%" "warning"
elif [[ "$usage" -gt 95 ]]; then
    log "CRITICAL: 數據目錄磁盤使用率: ${usage}%"
    send_alert "磁盤空間嚴重告警: 使用率 ${usage}%" "critical"
fi

log "磁盤使用率: ${usage}%"

}

發送告警

send_alert() {
local message=”$1″
local severity=”$2″

local payload
payload=$(cat <<EOF

{
“text”: “PostgreSQL HA Alert”,
“attachments”: [
{
“color”: “$([[ “$severity” == “critical” ]] && echo “danger” || echo “warning”)”,
“fields”: [
{
“title”: “Severity”,
“value”: “$severity”,
“short”: true
},
{
“title”: “Message”,
“value”: “$message”,
“short”: false
},
{
“title”: “Host”,
“value”: “$(hostname)”,
“short”: true
},
{
“title”: “Time”,
“value”: “$(date)”,
“short”: true
}
]
}
]
}
EOF
)

curl -X POST -H \'Content-type: application/json\' --data "$payload" "$WEBHOOK_URL" || true

}

生成健康報告

generate_health_report() {
log “生成健康檢查報告…”

local report_file="/tmp/pg-health-report-$(date +%Y%m%d-%H%M%S).json"

# 收集各種狀態信息
local patroni_status
patroni_status=$(curl -s "http://${PATRONI_HOST}:${PATRONI_PORT}/cluster" || echo \'{}\')

local pg_stats
pg_stats=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -c "
    SELECT json_build_object(
        \'connections\', (SELECT count(*) FROM pg_stat_activity),
        \'database_size\', pg_size_pretty(pg_database_size(current_database())),
        \'uptime\', date_trunc(\'second\', now() - pg_postmaster_start_time()),
        \'version\', version()
    );" -t | xargs 2>/dev/null || echo \'{}\')

# 生成 JSON 報告
cat > "$report_file" <<EOF

{
“timestamp”: “$(date -Iseconds)”,
“hostname”: “$(hostname)”,
“patroni_cluster”: $patroni_status,
“postgresql_stats”: $pg_stats,
“disk_usage”: “$(df /var/lib/postgresql/data | awk \’NR==2 {print $5}\’ 2>/dev/null || echo \’N/A\’)”
}
EOF

log "健康報告已生成: $report_file"

}

主執行函數

main() {
log “開始 PostgreSQL HA 健康檢查…”

local exit_code=0

check_db_connection || exit_code=1
check_patroni_cluster || exit_code=1
check_replication_lag || exit_code=1
check_disk_space || exit_code=1

generate_health_report

if [[ $exit_code -eq 0 ]]; then
    log "所有健康檢查通過"
else
    log "健康檢查發現問題,請查看日誌詳情"
fi

return $exit_code

}

腳本入口

if [[ “${BASH_SOURCE[0]}” == “${0}” ]]; then
main “$@”
fi

自動故障轉移腳本

!/bin/bash

Patroni 自動故障轉移管理腳本

監控集群狀態並執行必要的故障轉移操作

set -euo pipefail

配置變數

PATRONI_HOSTS=(“patroni1:8008” “patroni2:8008” “patroni3:8008”)
FAILOVER_TIMEOUT=60
MAX_FAILOVER_ATTEMPTS=3
NOTIFICATION_WEBHOOK=”https://hooks.slack.com/your-webhook”
LOG_FILE=”/var/log/patroni-failover.log”

日誌函數

log() {
echo “[$(date \’+%Y-%m-%d %H:%M:%S\’)] $1” | tee -a “$LOG_FILE”
}

發送通知

send_notification() {
local title=”$1″
local message=”$2″
local severity=”${3:-info}”

local color="good"
case "$severity" in
    "critical") color="danger" ;;
    "warning") color="warning" ;;
esac

local payload
payload=$(cat <<EOF

{
“text”: “$title”,
“attachments”: [
{
“color”: “$color”,
“fields”: [
{
“title”: “Message”,
“value”: “$message”,
“short”: false
},
{
“title”: “Host”,
“value”: “$(hostname)”,
“short”: true
},
{
“title”: “Time”,
“value”: “$(date)”,
“short”: true
}
]
}
]
}
EOF
)

curl -X POST -H \'Content-type: application/json\' --data "$payload" "$NOTIFICATION_WEBHOOK" &>/dev/null || true

}

獲取集群狀態

get_cluster_status() {
local host=”$1″

local status
status=$(curl -s --connect-timeout 5 "http://$host/cluster" 2>/dev/null || echo "ERROR")

if [[ "$status" == "ERROR" ]]; then
    return 1
fi

echo "$status"
return 0

}

查找可用的 Patroni 節點

find_available_patroni() {
for host in “${PATRONI_HOSTS[@]}”; do
if get_cluster_status “$host” &>/dev/null; then
echo “$host”
return 0
fi
done
return 1
}

檢查集群健康狀態

check_cluster_health() {
local patroni_host
if ! patroni_host=$(find_available_patroni); then
log “ERROR: 無法連接到任何 Patroni 節點”
return 1
fi

local status
if ! status=$(get_cluster_status "$patroni_host"); then
    log "ERROR: 無法獲取集群狀態"
    return 1
fi

# 解析集群狀態
local leader_count
leader_count=$(echo "$status" | jq \'.members | map(select(.role == "Leader")) | length\' 2>/dev/null || echo "0")

local total_members
total_members=$(echo "$status" | jq \'.members | length\' 2>/dev/null || echo "0")

local running_members
running_members=$(echo "$status" | jq \'.members | map(select(.state == "running")) | length\' 2>/dev/null || echo "0")

# 輸出狀態摘要
cat <<EOF

{
“leader_count”: $leader_count,
“total_members”: $total_members,
“running_members”: $running_members,
“status”: $status
}
EOF

return 0

}

執行手動故障轉移

perform_manual_failover() {
local target_node=”$1″
local patroni_host=”$2″

log "執行手動故障轉移到節點: $target_node"

local result
result=$(curl -s -X POST "http://$patroni_host/failover" \\
    -H "Content-Type: application/json" \\
    -d "{\\"leader\\": \\"$target_node\\"}" 2>/dev/null || echo "ERROR")

if [[ "$result" == "ERROR" ]]; then
    log "ERROR: 故障轉移請求失敗"
    return 1
fi

log "故障轉移請求已提交,等待完成..."
sleep 10

# 驗證故障轉移結果
local attempts=0
while [[ $attempts -lt $FAILOVER_TIMEOUT ]]; do
    local health_status
    if health_status=$(check_cluster_health); then
        local leader_count
        leader_count=$(echo "$health_status" | jq \'.leader_count\')

        if [[ "$leader_count" -eq 1 ]]; then
            log "故障轉移成功完成"
            send_notification "故障轉移完成" "集群已成功轉移到新的主節點" "info"
            return 0
        fi
    fi

    sleep 2
    ((attempts += 2))
done

log "ERROR: 故障轉移在 $FAILOVER_TIMEOUT 秒內未完成"
return 1

}

自動選擇最佳故障轉移目標

select_failover_target() {
local status=”$1″

# 查找狀態為 running 的 replica 節點
local target
target=$(echo "$status" | jq -r \'.members[] | select(.role == "Replica" and .state == "running") | .name\' | head -n1)

if [[ -n "$target" && "$target" != "null" ]]; then
    echo "$target"
    return 0
fi

return 1

}

處理集群異常狀態

handle_cluster_issues() {
local health_status=”$1″

local leader_count
leader_count=$(echo "$health_status" | jq \'.leader_count\')

local running_members
running_members=$(echo "$health_status" | jq \'.running_members\')

local total_members
total_members=$(echo "$health_status" | jq \'.total_members\')

# 沒有 Leader 的情況
if [[ "$leader_count" -eq 0 ]]; then
    log "CRITICAL: 集群中沒有 Leader 節點"
    send_notification "集群異常" "沒有可用的 Leader 節點" "critical"

    if [[ "$running_members" -gt 0 ]]; then
        log "嘗試從 $running_members 個運行中的節點選擇新 Leader"

        local status
        status=$(echo "$health_status" | jq \'.status\')

        local target
        if target=$(select_failover_target "$status"); then
            log "選擇節點 $target 作為新 Leader"

            local patroni_host
            if patroni_host=$(find_available_patroni); then
                perform_manual_failover "$target" "$patroni_host"
            else
                log "ERROR: 無法找到可用的 Patroni API 端點"
            fi
        else
            log "ERROR: 無法找到合適的故障轉移目標"
        fi
    else
        log "CRITICAL: 沒有運行中的節點可以提升為 Leader"
        send_notification "集群嚴重異常" "所有節點都不可用" "critical"
    fi

    return 1
fi

# 多個 Leader 的情況(腦裂)
if [[ "$leader_count" -gt 1 ]]; then
    log "CRITICAL: 檢測到腦裂狀態,有 $leader_count 個 Leader"
    send_notification "集群腦裂" "檢測到 $leader_count 個 Leader 節點" "critical"

    # 在腦裂情況下,通常需要人工介入
    log "腦裂狀況需要人工介入處理"
    return 1
fi

# 節點數量不足
local min_nodes=2
if [[ "$running_members" -lt $min_nodes ]]; then
    log "WARNING: 運行中的節點數量不足 ($running_members < $min_nodes)"
    send_notification "集群節點不足" "只有 $running_members 個節點在運行" "warning"
fi

return 0

}

主監控循環

monitor_cluster() {
log “開始監控 Patroni 集群…”

local consecutive_failures=0
local max_consecutive_failures=3

while true; do
    local health_status
    if health_status=$(check_cluster_health); then
        consecutive_failures=0

        local leader_count
        leader_count=$(echo "$health_status" | jq \'.leader_count\')

        local running_members
        running_members=$(echo "$health_status" | jq \'.running_members\')

        log "集群狀態正常: $leader_count Leader, $running_members 個運行節點"

        # 檢查是否需要處理異常狀態
        if [[ "$leader_count" -ne 1 ]] || [[ "$running_members" -lt 2 ]]; then
            handle_cluster_issues "$health_status"
        fi
    else
        ((consecutive_failures++))
        log "無法獲取集群狀態 (連續失敗: $consecutive_failures)"

        if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
            log "CRITICAL: 連續 $consecutive_failures 次無法獲取集群狀態"
            send_notification "監控異常" "連續 $consecutive_failures 次無法獲取集群狀態" "critical"
        fi
    fi

    sleep 30  # 每30秒檢查一次
done

}

一次性健康檢查

health_check() {
log “執行一次性健康檢查…”

local health_status
if health_status=$(check_cluster_health); then
    local leader_count
    leader_count=$(echo "$health_status" | jq \'.leader_count\')

    local running_members
    running_members=$(echo "$health_status" | jq \'.running_members\')

    local total_members
    total_members=$(echo "$health_status" | jq \'.total_members\')

    echo "集群狀態摘要:"
    echo "  Leader 節點: $leader_count"
    echo "  運行節點: $running_members"
    echo "  總節點數: $total_members"

    if [[ "$leader_count" -eq 1 && "$running_members" -ge 2 ]]; then
        echo "  狀態: 健康"
        return 0
    else
        echo "  狀態: 異常"
        handle_cluster_issues "$health_status"
        return 1
    fi
else
    echo "無法獲取集群狀態"
    return 1
fi

}

顯示使用說明

show_usage() {
cat <<EOF
用法: $0 [COMMAND]

命令:
monitor – 持續監控集群狀態 (默認)
check – 執行一次性健康檢查
failover – 執行手動故障轉移
help – 顯示此幫助信息

範例:
$0 monitor # 開始持續監控
$0 check # 執行健康檢查
$0 failover target-node # 故障轉移到指定節點
EOF
}

主函數

main() {
local command=”${1:-monitor}”

case "$command" in
    "monitor")
        monitor_cluster
        ;;
    "check")
        health_check
        ;;
    "failover")
        local target="${2:-}"
        if [[ -z "$target" ]]; then
            echo "錯誤: 請指定故障轉移目標節點"
            exit 1
        fi

        local patroni_host
        if patroni_host=$(find_available_patroni); then
            perform_manual_failover "$target" "$patroni_host"
        else
            echo "錯誤: 無法找到可用的 Patroni API 端點"
            exit 1
        fi
        ;;
    "help"|"-h"|"--help")
        show_usage
        ;;
    *)
        echo "錯誤: 未知命令 \'$command\'"
        show_usage
        exit 1
        ;;
esac

}

腳本入口

if [[ “${BASH_SOURCE[0]}” == “${0}” ]]; then
main “$@”
fi

5. 自動化部署

Ansible Playbook 範例

# patroni-setup.yml
- hosts: postgres_cluster
  become: yes
  vars:
    postgres_version: 14
    patroni_version: 2.1.4

  tasks:
    - name: Install PostgreSQL and Patroni
      pip:
        name:
          - patroni[etcd]=={{ patroni_version }}
          - psycopg2-binary

    - name: Configure Patroni
      template:
        src: patroni.yml.j2
        dest: /etc/patroni.yml
      notify: restart patroni

    - name: Start Patroni service
      systemd:
        name: patroni
        enabled: yes
        state: started

6. 備份自動化

#!/bin/bash
# 自動化備份腳本

# WAL-E 連續歸檔
export WALE_S3_PREFIX="s3://your-backup-bucket/wal-e"
export AWS_ACCESS_KEY_ID="your-access-key"
export AWS_SECRET_ACCESS_KEY="your-secret-key"

# 每日全量備份
0 2 * * * /usr/local/bin/wal-e backup-push /var/lib/postgresql/data

# 清理舊備份 (保留30天)
0 3 * * 0 /usr/local/bin/wal-e delete --confirm retain 30

7. 完整監控 Dashboard

Grafana Dashboard 關鍵面板:
– Patroni 集群拓撲
– PostgreSQL 連接數趨勢
– 複製延遲監控
– 磁盤 I/O 和空間使用
– 查詢性能統計
– 錯誤日誌統計

8. 運維自動化 Cron 任務

# /etc/crontab
# 每分鐘健康檢查
* * * * * postgres /opt/scripts/pg_health_check.sh

# 每5分鐘檢查複製狀態
*/5 * * * * postgres /opt/scripts/check_replication.sh

# 每小時性能統計
0 * * * * postgres /opt/scripts/performance_stats.sh

# 每日備份驗證
0 6 * * * postgres /opt/scripts/verify_backup.sh

這套完整的方案可以確保 Patroni 同步複製環境的高可用性,通過多層次監控和自動化來達到 99.99% 的 SLA 目標。需要我針對某個特定部分提供更詳細的配置嗎?

留言

發佈留言

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *