DB HA
Step by Step PostgreSQL High Availability Setup using Patroni & Etcd & HAProxy for Debian (Ubuntu)
以下
192.168.1.65 -ha ip
192.168.1.67 -db1 ip
192.168.1.69 -db2 ip
一開始有廢話的
sudo vi /etc/hosts
192.168.1.67 pg-patroni01
192.168.1.69 pg-patroni01
192.168.1.65 pg-proxy
work機
node-bash
sudo hostnamectl set-hostname pg-patroni02
sudo vi /etc/hosts
sudo sh -c \'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list\'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt -y install postgresql-15 postgresql-server-dev-15
sudo ln -s /usr/lib/postgresql/15/bin/* /usr/sbin/
sudo -u postgres psql # 改密碼
systemctl stop postgresql.service
systemctl status postgresql.service
sudo apt -y install python3 python3-pip
sudo apt install python3-pip python3-dev libpq-dev -y
sudo -H pip3 install psycopg2 --break-system-packages
sudo -H pip3 install patroni --break-system-packages
sudo -H pip3 install python-etcd --break-system-packages
sudo mkdir -p /data/patroni
sudo chown postgres:postgres /data/patroni/
sudo chmod 700 /data/patroni/
sudo vi /etc/patroni.yml
sudo vi /etc/systemd/system/patroni.service
sudo systemctl daemon-reload
sudo systemctl enable patroni
sudo systemctl enable postgresql
sudo systemctl start patroni
sudo systemctl start postgresql
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/systemd/system/patroni.service
sudo vi /etc/patroni.yml
sudo systemctl daemon-reload
sudo systemctl enable patroni
sudo systemctl enable postgresql
sudo systemctl start patroni
sudo systemctl start postgresql
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/patroni.yml
sudo ss -ltnp | grep 5432
journalctl -u patroni -f
sudo vi /etc/systemd/system/etcd.service
sudo systemctl daemon-reload
sudo systemctl enable etcd
sudo systemctl start etcd
curl http://192.168.1.69:2379/v2/keys/
sudo patronictl -c /etc/patroni.yml list
sudo systemctl restart patroni
curl http://192.168.1.67:8008/readiness
sudo journalctl -u patroni -f
psql -h 192.168.1.65 -p 5000 -U postgres -W
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/patroni.yml
“` yml=
scope: postgres
namespace: /db/
name: patronidb2
restapi:
listen: 192.168.1.69:8008
connect_address: 192.168.1.69:8008
etcd3:
host: 192.168.1.65:2379
bootstrap:
dcs:
ttl: 30
loop_wait: 10
retry_timeout: 10
maximum_lag_on_failover: 1048576
postgresql:
use_pg_rewind: true
initdb:
– encoding: UTF8
– data-checksums
pg_hba:
– host replication replicator 127.0.0.1/32 md5
– host replication replicator 192.168.1.69/0 md5
– host replication replicator 192.168.1.67/0 md5
– host all all 0.0.0.0/0 md5
users:
admin:
password: admin
options:
– createrole
– createdb
postgresql:
listen: 192.168.1.69:5432
connect_address: 192.168.1.69:5432
data_dir: /data/patroni
pgpass: /tmp/pgpass
authentication:
replication:
username: replicator
password: replicator2024
superuser:
username: postgres
password: postgres2024
parameters:
unix_socket_directories: \’.\’
tags:
nofailover: false
noloadbalance: false
clonefrom: false
nosync: false
#### sudo vi /etc/systemd/system/patroni.service
``` yml=
[Unit]
Description=Runners to orchestrate a high-availability PostgreSQL
After=syslog.target network.target
[Service]
Type=simple
User=postgres
Group=postgres
ExecStart=/usr/local/bin/patroni /etc/patroni.yml
KillMode=process
TimeoutSec=30
Restart=no
[Install]
WantedBy=multi-user.target
sudo vi /etc/systemd/system/etcd.service
“`yaml=
[Unit]
Description=etcd key-value store
After=network.target
[Service]
Type=notify
ExecStart=/usr/local/bin/etcd \
–name node1 \
–data-dir /var/lib/etcd \
–listen-peer-urls http://192.168.1.69:2380 \
–listen-client-urls http://192.168.1.69:2379,http://127.0.0.1:2379 \
–advertise-client-urls http://192.168.1.69:2379 \
–initial-advertise-peer-urls http://192.168.1.69:2380 \
–initial-cluster node1=http://192.168.1.69:2380 \
–initial-cluster-token etcd-cluster \
–initial-cluster-state new \
–enable-v2=true
Restart=always
RestartSec=5
LimitNOFILE=40000
[Install]
WantedBy=multi-user.target
#### 改帳密
```sql=
# psql
postgres=# ALTER USER postgres PASSWORD \'postgres2024\';
postgres=# CREATE USER replicator WITH ENCRYPTED PASSWORD \'replicator2024\';
ha
bash
sudo hostnamectl set-hostname pg-proxy
sudo vi /etc/hosts
sudo apt-get update
sudo apt update && sudo apt upgrade -y
sudo apt install -y etcd-server haproxy
sudo vi /etc/default/etcd
sudo systemctl restart etcd
sudo systemctl status etcd
sudo vi /etc/haproxy/haproxy.cfg
sudo systemctl start haproxy
sudo systemctl enable haproxy
sudo systemctl restart haproxy
sudo systemctl status haproxy
sudo haproxy -c -V -f /etc/haproxy/haproxy.cfg
sudo netstat -tulnp | grep 5000
sudo ufw allow 5000/tcp
curl http://192.168.1.67:8008/health
sudo journalctl -u haproxy -f
curl http://192.168.1.65:2379/v2/keys/
sudo systemctl restart haproxy
sudo systemctl status haproxy
curl -v http://192.168.1.67:8008/readiness
sudo journalctl -u haproxy -f
sudo systemctl status haproxy
history
sudo vi /etc/default/etcd
ETCD_LISTEN_PEER_URLS="http://192.168.1.65:2380,http://127.0.0.1:7001"
ETCD_LISTEN_CLIENT_URLS="http://127.0.0.1:2379, http://192.168.1.65:2379"
ETCD_INITIAL_ADVERTISE_PEER_URLS="http://192.168.1.65:2380"
ETCD_INITIAL_CLUSTER="etcd0=http://192.168.1.65:2380,"
ETCD_ADVERTISE_CLIENT_URLS="http://192.168.1.65:2379"
ETCD_INITIAL_CLUSTER_TOKEN="cluster1"
ETCD_INITIAL_CLUSTER_STATE="new"
ETCD_ENABLE_V2="true"
sudo vi /etc/haproxy/haproxy.cfg
global
maxconn 4000
defaults
log global
mode tcp
retries 2
timeout connect 4s
timeout client 30m
timeout server 30m
timeout check 5s
listen stats
mode http
bind 192.168.1.65:7000
stats enable
stats uri /
listen postgres
bind *:5000
option httpchk
http-check expect status 200
default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server node1 192.168.1.67:5432 maxconn 1000 check port 8008
server node2 192.168.1.69:5432 maxconn 1000 check port 8008
```




## 去除廢話版本
### work機
#### bash
```bash=
sudo sh -c \'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list\'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt -y install postgresql-15 postgresql-server-dev-15
sudo ln -s /usr/lib/postgresql/15/bin/* /usr/sbin/
sudo -u postgres psql
systemctl stop postgresql.service
systemctl status postgresql.service
sudo apt -y install python3 python3-pip
sudo apt install python3-pip python3-dev libpq-dev -y
sudo -H pip3 install psycopg2 --break-system-packages
sudo -H pip3 install patroni --break-system-packages
sudo -H pip3 install python-etcd --break-system-packages
sudo mkdir -p /data/patroni
sudo chown postgres:postgres /data/patroni/
sudo chmod 700 /data/patroni/
sudo vi /etc/patroni.yml
sudo vi /etc/systemd/system/patroni.service
sudo systemctl daemon-reload
sudo systemctl enable patroni
sudo systemctl enable postgresql
sudo systemctl start patroni
sudo systemctl start postgresql
sudo patronictl -c /etc/patroni.yml list
sudo vi /etc/systemd/system/etcd.service
sudo patronictl -c /etc/patroni.yml list
sudo -u postgres psql
ALTER USER postgres PASSWORD \'postgres2024\';
CREATE USER replicator WITH ENCRYPTED PASSWORD \'replicator2024\';
sudo vi /etc/patroni.yml
scope: postgres
namespace: /db/
name: patronidb2
restapi:
listen: 192.168.1.74:8008
connect_address: 192.168.1.74:8008
etcd3:
host: 192.168.1.73:2379
bootstrap:
dcs:
ttl: 30
loop_wait: 10
retry_timeout: 10
maximum_lag_on_failover: 1048576
postgresql:
use_pg_rewind: true
initdb:
- encoding: UTF8
- data-checksums
pg_hba:
- host replication replicator 127.0.0.1/32 md5
- host replication replicator 192.168.1.74/0 md5
- host replication replicator 192.168.1.57/0 md5
- host all all 0.0.0.0/0 md5
users:
admin:
password: admin
options:
- createrole
- createdb
postgresql:
listen: 192.168.1.74:5432
connect_address: 192.168.1.74:5432
data_dir: /data/patroni
pgpass: /tmp/pgpass
authentication:
replication:
username: replicator
password: replicator2024
superuser:
username: postgres
password: postgres2024
parameters:
unix_socket_directories: \'.\'
tags:
nofailover: false
noloadbalance: false
clonefrom: false
nosync: false
sudo vi /etc/systemd/system/patroni.service
[Unit]
Description=Runners to orchestrate a high-availability PostgreSQL
After=syslog.target network.target
[Service]
Type=simple
User=postgres
Group=postgres
ExecStart=/usr/local/bin/patroni /etc/patroni.yml
KillMode=process
TimeoutSec=30
Restart=no
[Install]
WantedBy=multi-user.target
sudo vi /etc/systemd/system/etcd.service
[Unit]
Description=etcd key-value store
After=network.target
[Service]
Type=notify
ExecStart=/usr/local/bin/etcd \\
--name node1 \\
--data-dir /var/lib/etcd \\
--listen-peer-urls http://192.168.1.74:2380 \\
--listen-client-urls http://192.168.1.74:2379,http://127.0.0.1:2379 \\
--advertise-client-urls http://192.168.1.74:2379 \\
--initial-advertise-peer-urls http://192.168.1.74:2380 \\
--initial-cluster node1=http://192.168.1.74:2380 \\
--initial-cluster-token etcd-cluster \\
--initial-cluster-state new \\
--enable-v2=true
Restart=always
RestartSec=5
LimitNOFILE=40000
[Install]
WantedBy=multi-user.target
ha機
bash
“`bash=
sudo apt-get update
sudo apt update && sudo apt upgrade -y
sudo apt install -y etcd-server haproxy
sudo vi /etc/default/etcd
sudo vi /etc/haproxy/haproxy.cfg
sudo systemctl restart etcd
sudo systemctl status etcd
sudo systemctl start haproxy
sudo systemctl enable haproxy
sudo systemctl restart haproxy
sudo systemctl status haproxy
sudo haproxy -c -V -f /etc/haproxy/haproxy.cfg
sudo netstat -tulnp | grep 5000
sudo ufw allow 5000/tcp
sudo systemctl status haproxy
sudo vi /etc/default/etcd
ETCD_LISTEN_PEER_URLS=”http://192.168.1.73:2380,http://127.0.0.1:7001″
ETCD_LISTEN_CLIENT_URLS=”http://127.0.0.1:2379, http://192.168.1.73:2379″
ETCD_INITIAL_ADVERTISE_PEER_URLS=”http://192.168.1.73:2380″
ETCD_INITIAL_CLUSTER=”etcd0=http://192.168.1.73:2380,”
ETCD_ADVERTISE_CLIENT_URLS=”http://192.168.1.73:2379″
ETCD_INITIAL_CLUSTER_TOKEN=”cluster1″
ETCD_INITIAL_CLUSTER_STATE=”new”
ETCD_ENABLE_V2=”true”
sudo vi /etc/haproxy/haproxy.cfg
global
maxconn 4000
defaults
log global
mode tcp
retries 2
timeout connect 4s
timeout client 30m
timeout server 30m
timeout check 5s
listen stats
mode http
bind 192.168.1.73:7000
stats enable
stats uri /
listen postgres
bind *:5000
option httpchk
http-check expect status 200
default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server node1 192.168.1.57:5432 maxconn 1000 check port 8008
server node2 192.168.1.74:5432 maxconn 1000 check port 8008

> **INFO:** ## Patroni 同步複製完整實施方案
### 1. Patroni 集群架構
```yaml
# patroni.yml 關鍵配置
postgresql:
parameters:
synchronous_commit: \'on\'
synchronous_standby_names: \'ANY 1 (replica1, replica2)\'
max_wal_senders: 10
wal_level: replica
hot_standby: on
bootstrap:
dcs:
ttl: 30
loop_wait: 10
retry_timeout: 30
maximum_lag_on_failover: 1048576
synchronous_mode: true
synchronous_mode_strict: true
節點配置:
– Primary: 主寫入節點
– Replica1: 同步備份(同機房)
– Replica2: 同步備份(異機房)
– etcd 集群: 3 節點分散式配置存儲
2. 監控系統架構
核心監控組件
Prometheus + Grafana 棧:
# docker-compose.yml
services:
prometheus:
image: prom/prometheus
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
grafana:
image: grafana/grafana
environment:
- GF_SECURITY_ADMIN_PASSWORD=secure_password
postgres_exporter:
image: prometheuscommunity/postgres-exporter
environment:
- DATA_SOURCE_NAME=postgresql://monitor:password@postgres:5432/postgres?sslmode=disable
關鍵監控指標:
– pg_up: 數據庫可用性
– pg_replication_lag: 複製延遲
– pg_stat_replication: 複製狀態
– patroni_master: 主節點狀態
– patroni_replica_lag_in_seconds: Patroni 複製延遲
預警規則配置
# prometheus rules
groups:
- name: postgresql
rules:
- alert: PostgreSQLDown
expr: pg_up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "PostgreSQL 實例宕機"
- alert: ReplicationLag
expr: pg_replication_lag > 100
for: 2m
labels:
severity: warning
annotations:
summary: "複製延遲超過 100MB"
- alert: PatroniMasterMissing
expr: absent(patroni_master == 1)
for: 1m
labels:
severity: critical
annotations:
summary: "Patroni 主節點缺失"
3. 日誌監控
ELK Stack 配置:
# logstash pipeline
input {
file {
path => "/var/log/postgresql/*.log"
type => "postgresql"
}
file {
path => "/var/log/patroni/*.log"
type => "patroni"
}
}
filter {
if [type] == "postgresql" {
grok {
match => { "message" => "%{TIMESTAMP_ISO8601:timestamp} \\[%{INT:pid}\\] %{WORD:level}: %{GREEDYDATA:msg}" }
}
}
}
4. 自動化腳本
健康檢查腳本
!/bin/bash
PostgreSQL HA 健康檢查腳本
檢查 Patroni 集群狀態和複製健康度
set -euo pipefail
配置變數
PATRONI_HOST=”localhost”
PATRONI_PORT=”8008″
PG_HOST=”localhost”
PG_PORT=”5432″
PG_USER=”postgres”
ALERT_THRESHOLD_LAG=100 # MB
WEBHOOK_URL=”https://hooks.slack.com/your-webhook”
日誌函數
log() {
echo “[$(date \’+%Y-%m-%d %H:%M:%S\’)] $1” | tee -a /var/log/pg-health-check.log
}
檢查 Patroni 集群狀態
check_patroni_cluster() {
log “檢查 Patroni 集群狀態…”
local status
status=$(curl -s "http://${PATRONI_HOST}:${PATRONI_PORT}/cluster" || echo "ERROR")
if [[ "$status" == "ERROR" ]]; then
log "ERROR: 無法連接到 Patroni API"
send_alert "Patroni API 不可達" "critical"
return 1
fi
# 解析集群狀態
local leader_count
leader_count=$(echo "$status" | jq \'.members | map(select(.role == "Leader")) | length\')
if [[ "$leader_count" -ne 1 ]]; then
log "ERROR: 集群中有 $leader_count 個 Leader"
send_alert "Patroni 集群狀態異常: ${leader_count} 個 Leader" "critical"
return 1
fi
log "Patroni 集群狀態正常"
return 0
}
檢查複製延遲
check_replication_lag() {
log “檢查複製延遲…”
local query="SELECT client_addr, state, pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn)) as lag FROM pg_stat_replication;"
local result
result=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -t -c "$query" 2>/dev/null || echo "ERROR")
if [[ "$result" == "ERROR" ]]; then
log "ERROR: 無法查詢複製狀態"
send_alert "無法查詢 PostgreSQL 複製狀態" "critical"
return 1
fi
# 檢查每個複製節點的延遲
while IFS=\'|\' read -r client_addr state lag; do
client_addr=$(echo "$client_addr" | xargs)
state=$(echo "$state" | xargs)
lag=$(echo "$lag" | xargs)
if [[ "$state" != "streaming" ]]; then
log "WARNING: 複製節點 $client_addr 狀態異常: $state"
send_alert "複製節點 $client_addr 狀態: $state" "warning"
fi
# 轉換延遲為 MB
local lag_mb
if [[ "$lag" =~ ([0-9]+)\\ MB ]]; then
lag_mb=${BASH_REMATCH[1]}
if [[ "$lag_mb" -gt "$ALERT_THRESHOLD_LAG" ]]; then
log "WARNING: 複製節點 $client_addr 延遲過大: ${lag_mb}MB"
send_alert "複製延遲告警: $client_addr 延遲 ${lag_mb}MB" "warning"
fi
fi
log "複製節點 $client_addr: $state, 延遲: $lag"
done <<< "$result"
return 0
}
檢查數據庫連接
check_db_connection() {
log “檢查數據庫連接…”
local result
result=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -c "SELECT 1;" -t 2>/dev/null || echo "ERROR")
if [[ "$result" == "ERROR" ]]; then
log "ERROR: 無法連接到 PostgreSQL"
send_alert "PostgreSQL 數據庫連接失敗" "critical"
return 1
fi
log "數據庫連接正常"
return 0
}
檢查磁盤空間
check_disk_space() {
log “檢查磁盤空間…”
local data_dir
data_dir=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -t -c "SHOW data_directory;" | xargs)
local usage
usage=$(df "$data_dir" | awk \'NR==2 {print $5}\' | sed \'s/%//\')
if [[ "$usage" -gt 85 ]]; then
log "WARNING: 數據目錄磁盤使用率: ${usage}%"
send_alert "磁盤空間告警: 使用率 ${usage}%" "warning"
elif [[ "$usage" -gt 95 ]]; then
log "CRITICAL: 數據目錄磁盤使用率: ${usage}%"
send_alert "磁盤空間嚴重告警: 使用率 ${usage}%" "critical"
fi
log "磁盤使用率: ${usage}%"
}
發送告警
send_alert() {
local message=”$1″
local severity=”$2″
local payload
payload=$(cat <<EOF
{
“text”: “PostgreSQL HA Alert”,
“attachments”: [
{
“color”: “$([[ “$severity” == “critical” ]] && echo “danger” || echo “warning”)”,
“fields”: [
{
“title”: “Severity”,
“value”: “$severity”,
“short”: true
},
{
“title”: “Message”,
“value”: “$message”,
“short”: false
},
{
“title”: “Host”,
“value”: “$(hostname)”,
“short”: true
},
{
“title”: “Time”,
“value”: “$(date)”,
“short”: true
}
]
}
]
}
EOF
)
curl -X POST -H \'Content-type: application/json\' --data "$payload" "$WEBHOOK_URL" || true
}
生成健康報告
generate_health_report() {
log “生成健康檢查報告…”
local report_file="/tmp/pg-health-report-$(date +%Y%m%d-%H%M%S).json"
# 收集各種狀態信息
local patroni_status
patroni_status=$(curl -s "http://${PATRONI_HOST}:${PATRONI_PORT}/cluster" || echo \'{}\')
local pg_stats
pg_stats=$(psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -c "
SELECT json_build_object(
\'connections\', (SELECT count(*) FROM pg_stat_activity),
\'database_size\', pg_size_pretty(pg_database_size(current_database())),
\'uptime\', date_trunc(\'second\', now() - pg_postmaster_start_time()),
\'version\', version()
);" -t | xargs 2>/dev/null || echo \'{}\')
# 生成 JSON 報告
cat > "$report_file" <<EOF
{
“timestamp”: “$(date -Iseconds)”,
“hostname”: “$(hostname)”,
“patroni_cluster”: $patroni_status,
“postgresql_stats”: $pg_stats,
“disk_usage”: “$(df /var/lib/postgresql/data | awk \’NR==2 {print $5}\’ 2>/dev/null || echo \’N/A\’)”
}
EOF
log "健康報告已生成: $report_file"
}
主執行函數
main() {
log “開始 PostgreSQL HA 健康檢查…”
local exit_code=0
check_db_connection || exit_code=1
check_patroni_cluster || exit_code=1
check_replication_lag || exit_code=1
check_disk_space || exit_code=1
generate_health_report
if [[ $exit_code -eq 0 ]]; then
log "所有健康檢查通過"
else
log "健康檢查發現問題,請查看日誌詳情"
fi
return $exit_code
}
腳本入口
if [[ “${BASH_SOURCE[0]}” == “${0}” ]]; then
main “$@”
fi
自動故障轉移腳本
!/bin/bash
Patroni 自動故障轉移管理腳本
監控集群狀態並執行必要的故障轉移操作
set -euo pipefail
配置變數
PATRONI_HOSTS=(“patroni1:8008” “patroni2:8008” “patroni3:8008”)
FAILOVER_TIMEOUT=60
MAX_FAILOVER_ATTEMPTS=3
NOTIFICATION_WEBHOOK=”https://hooks.slack.com/your-webhook”
LOG_FILE=”/var/log/patroni-failover.log”
日誌函數
log() {
echo “[$(date \’+%Y-%m-%d %H:%M:%S\’)] $1” | tee -a “$LOG_FILE”
}
發送通知
send_notification() {
local title=”$1″
local message=”$2″
local severity=”${3:-info}”
local color="good"
case "$severity" in
"critical") color="danger" ;;
"warning") color="warning" ;;
esac
local payload
payload=$(cat <<EOF
{
“text”: “$title”,
“attachments”: [
{
“color”: “$color”,
“fields”: [
{
“title”: “Message”,
“value”: “$message”,
“short”: false
},
{
“title”: “Host”,
“value”: “$(hostname)”,
“short”: true
},
{
“title”: “Time”,
“value”: “$(date)”,
“short”: true
}
]
}
]
}
EOF
)
curl -X POST -H \'Content-type: application/json\' --data "$payload" "$NOTIFICATION_WEBHOOK" &>/dev/null || true
}
獲取集群狀態
get_cluster_status() {
local host=”$1″
local status
status=$(curl -s --connect-timeout 5 "http://$host/cluster" 2>/dev/null || echo "ERROR")
if [[ "$status" == "ERROR" ]]; then
return 1
fi
echo "$status"
return 0
}
查找可用的 Patroni 節點
find_available_patroni() {
for host in “${PATRONI_HOSTS[@]}”; do
if get_cluster_status “$host” &>/dev/null; then
echo “$host”
return 0
fi
done
return 1
}
檢查集群健康狀態
check_cluster_health() {
local patroni_host
if ! patroni_host=$(find_available_patroni); then
log “ERROR: 無法連接到任何 Patroni 節點”
return 1
fi
local status
if ! status=$(get_cluster_status "$patroni_host"); then
log "ERROR: 無法獲取集群狀態"
return 1
fi
# 解析集群狀態
local leader_count
leader_count=$(echo "$status" | jq \'.members | map(select(.role == "Leader")) | length\' 2>/dev/null || echo "0")
local total_members
total_members=$(echo "$status" | jq \'.members | length\' 2>/dev/null || echo "0")
local running_members
running_members=$(echo "$status" | jq \'.members | map(select(.state == "running")) | length\' 2>/dev/null || echo "0")
# 輸出狀態摘要
cat <<EOF
{
“leader_count”: $leader_count,
“total_members”: $total_members,
“running_members”: $running_members,
“status”: $status
}
EOF
return 0
}
執行手動故障轉移
perform_manual_failover() {
local target_node=”$1″
local patroni_host=”$2″
log "執行手動故障轉移到節點: $target_node"
local result
result=$(curl -s -X POST "http://$patroni_host/failover" \\
-H "Content-Type: application/json" \\
-d "{\\"leader\\": \\"$target_node\\"}" 2>/dev/null || echo "ERROR")
if [[ "$result" == "ERROR" ]]; then
log "ERROR: 故障轉移請求失敗"
return 1
fi
log "故障轉移請求已提交,等待完成..."
sleep 10
# 驗證故障轉移結果
local attempts=0
while [[ $attempts -lt $FAILOVER_TIMEOUT ]]; do
local health_status
if health_status=$(check_cluster_health); then
local leader_count
leader_count=$(echo "$health_status" | jq \'.leader_count\')
if [[ "$leader_count" -eq 1 ]]; then
log "故障轉移成功完成"
send_notification "故障轉移完成" "集群已成功轉移到新的主節點" "info"
return 0
fi
fi
sleep 2
((attempts += 2))
done
log "ERROR: 故障轉移在 $FAILOVER_TIMEOUT 秒內未完成"
return 1
}
自動選擇最佳故障轉移目標
select_failover_target() {
local status=”$1″
# 查找狀態為 running 的 replica 節點
local target
target=$(echo "$status" | jq -r \'.members[] | select(.role == "Replica" and .state == "running") | .name\' | head -n1)
if [[ -n "$target" && "$target" != "null" ]]; then
echo "$target"
return 0
fi
return 1
}
處理集群異常狀態
handle_cluster_issues() {
local health_status=”$1″
local leader_count
leader_count=$(echo "$health_status" | jq \'.leader_count\')
local running_members
running_members=$(echo "$health_status" | jq \'.running_members\')
local total_members
total_members=$(echo "$health_status" | jq \'.total_members\')
# 沒有 Leader 的情況
if [[ "$leader_count" -eq 0 ]]; then
log "CRITICAL: 集群中沒有 Leader 節點"
send_notification "集群異常" "沒有可用的 Leader 節點" "critical"
if [[ "$running_members" -gt 0 ]]; then
log "嘗試從 $running_members 個運行中的節點選擇新 Leader"
local status
status=$(echo "$health_status" | jq \'.status\')
local target
if target=$(select_failover_target "$status"); then
log "選擇節點 $target 作為新 Leader"
local patroni_host
if patroni_host=$(find_available_patroni); then
perform_manual_failover "$target" "$patroni_host"
else
log "ERROR: 無法找到可用的 Patroni API 端點"
fi
else
log "ERROR: 無法找到合適的故障轉移目標"
fi
else
log "CRITICAL: 沒有運行中的節點可以提升為 Leader"
send_notification "集群嚴重異常" "所有節點都不可用" "critical"
fi
return 1
fi
# 多個 Leader 的情況(腦裂)
if [[ "$leader_count" -gt 1 ]]; then
log "CRITICAL: 檢測到腦裂狀態,有 $leader_count 個 Leader"
send_notification "集群腦裂" "檢測到 $leader_count 個 Leader 節點" "critical"
# 在腦裂情況下,通常需要人工介入
log "腦裂狀況需要人工介入處理"
return 1
fi
# 節點數量不足
local min_nodes=2
if [[ "$running_members" -lt $min_nodes ]]; then
log "WARNING: 運行中的節點數量不足 ($running_members < $min_nodes)"
send_notification "集群節點不足" "只有 $running_members 個節點在運行" "warning"
fi
return 0
}
主監控循環
monitor_cluster() {
log “開始監控 Patroni 集群…”
local consecutive_failures=0
local max_consecutive_failures=3
while true; do
local health_status
if health_status=$(check_cluster_health); then
consecutive_failures=0
local leader_count
leader_count=$(echo "$health_status" | jq \'.leader_count\')
local running_members
running_members=$(echo "$health_status" | jq \'.running_members\')
log "集群狀態正常: $leader_count Leader, $running_members 個運行節點"
# 檢查是否需要處理異常狀態
if [[ "$leader_count" -ne 1 ]] || [[ "$running_members" -lt 2 ]]; then
handle_cluster_issues "$health_status"
fi
else
((consecutive_failures++))
log "無法獲取集群狀態 (連續失敗: $consecutive_failures)"
if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
log "CRITICAL: 連續 $consecutive_failures 次無法獲取集群狀態"
send_notification "監控異常" "連續 $consecutive_failures 次無法獲取集群狀態" "critical"
fi
fi
sleep 30 # 每30秒檢查一次
done
}
一次性健康檢查
health_check() {
log “執行一次性健康檢查…”
local health_status
if health_status=$(check_cluster_health); then
local leader_count
leader_count=$(echo "$health_status" | jq \'.leader_count\')
local running_members
running_members=$(echo "$health_status" | jq \'.running_members\')
local total_members
total_members=$(echo "$health_status" | jq \'.total_members\')
echo "集群狀態摘要:"
echo " Leader 節點: $leader_count"
echo " 運行節點: $running_members"
echo " 總節點數: $total_members"
if [[ "$leader_count" -eq 1 && "$running_members" -ge 2 ]]; then
echo " 狀態: 健康"
return 0
else
echo " 狀態: 異常"
handle_cluster_issues "$health_status"
return 1
fi
else
echo "無法獲取集群狀態"
return 1
fi
}
顯示使用說明
show_usage() {
cat <<EOF
用法: $0 [COMMAND]
命令:
monitor – 持續監控集群狀態 (默認)
check – 執行一次性健康檢查
failover – 執行手動故障轉移
help – 顯示此幫助信息
範例:
$0 monitor # 開始持續監控
$0 check # 執行健康檢查
$0 failover target-node # 故障轉移到指定節點
EOF
}
主函數
main() {
local command=”${1:-monitor}”
case "$command" in
"monitor")
monitor_cluster
;;
"check")
health_check
;;
"failover")
local target="${2:-}"
if [[ -z "$target" ]]; then
echo "錯誤: 請指定故障轉移目標節點"
exit 1
fi
local patroni_host
if patroni_host=$(find_available_patroni); then
perform_manual_failover "$target" "$patroni_host"
else
echo "錯誤: 無法找到可用的 Patroni API 端點"
exit 1
fi
;;
"help"|"-h"|"--help")
show_usage
;;
*)
echo "錯誤: 未知命令 \'$command\'"
show_usage
exit 1
;;
esac
}
腳本入口
if [[ “${BASH_SOURCE[0]}” == “${0}” ]]; then
main “$@”
fi
5. 自動化部署
Ansible Playbook 範例
# patroni-setup.yml
- hosts: postgres_cluster
become: yes
vars:
postgres_version: 14
patroni_version: 2.1.4
tasks:
- name: Install PostgreSQL and Patroni
pip:
name:
- patroni[etcd]=={{ patroni_version }}
- psycopg2-binary
- name: Configure Patroni
template:
src: patroni.yml.j2
dest: /etc/patroni.yml
notify: restart patroni
- name: Start Patroni service
systemd:
name: patroni
enabled: yes
state: started
6. 備份自動化
#!/bin/bash
# 自動化備份腳本
# WAL-E 連續歸檔
export WALE_S3_PREFIX="s3://your-backup-bucket/wal-e"
export AWS_ACCESS_KEY_ID="your-access-key"
export AWS_SECRET_ACCESS_KEY="your-secret-key"
# 每日全量備份
0 2 * * * /usr/local/bin/wal-e backup-push /var/lib/postgresql/data
# 清理舊備份 (保留30天)
0 3 * * 0 /usr/local/bin/wal-e delete --confirm retain 30
7. 完整監控 Dashboard
Grafana Dashboard 關鍵面板:
– Patroni 集群拓撲
– PostgreSQL 連接數趨勢
– 複製延遲監控
– 磁盤 I/O 和空間使用
– 查詢性能統計
– 錯誤日誌統計
8. 運維自動化 Cron 任務
# /etc/crontab
# 每分鐘健康檢查
* * * * * postgres /opt/scripts/pg_health_check.sh
# 每5分鐘檢查複製狀態
*/5 * * * * postgres /opt/scripts/check_replication.sh
# 每小時性能統計
0 * * * * postgres /opt/scripts/performance_stats.sh
# 每日備份驗證
0 6 * * * postgres /opt/scripts/verify_backup.sh
這套完整的方案可以確保 Patroni 同步複製環境的高可用性,通過多層次監控和自動化來達到 99.99% 的 SLA 目標。需要我針對某個特定部分提供更詳細的配置嗎?
發佈留言