Skip to content

Commit 16a2cec

Browse files
committed
Allow failover for replicas
1 parent 0ee59c0 commit 16a2cec

File tree

6 files changed

+174
-79
lines changed

6 files changed

+174
-79
lines changed

CONFIG.md

+9
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,15 @@ If the client doesn't specify, PgCat routes traffic to this role by default.
309309
`replica` round-robin between replicas only without touching the primary,
310310
`primary` all queries go to the primary unless otherwise specified.
311311

312+
### replica_to_primary_failover_enabled
313+
```
314+
path: pools.<pool_name>.replica_to_primary_failover_enabled
315+
default: "false"
316+
```
317+
318+
If set to true, when the specified role is `replica` (either by setting `default_role` or manually)
319+
and all replicas are banned, queries will be sent to the primary (until a replica is back online).
320+
312321
### prepared_statements_cache_size
313322
```
314323
path: general.prepared_statements_cache_size

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ The setting will persist until it's changed again or the client disconnects.
175175
By default, all queries are routed to the first available server; `default_role` setting controls this behavior.
176176

177177
### Failover
178-
All servers are checked with a `;` (very fast) query before being given to a client. Additionally, the server health is monitored with every client query that it processes. If the server is not reachable, it will be banned and cannot serve any more transactions for the duration of the ban. The queries are routed to the remaining servers. If all servers become banned, the ban list is cleared: this is a safety precaution against false positives. The primary can never be banned.
178+
All servers are checked with a `;` (very fast) query before being given to a client. Additionally, the server health is monitored with every client query that it processes. If the server is not reachable, it will be banned and cannot serve any more transactions for the duration of the ban. The queries are routed to the remaining servers. If `replica_to_primary_failover_enabled` is set to true and all replicas become banned, the query will be routed to the primary. If `replica_to_primary_failover_enabled` is false and all servers (replicas) become banned, the ban list is cleared: this is a safety precaution against false positives. The primary can never be banned.
179179

180180
The ban time can be changed with `ban_time`. The default is 60 seconds.
181181

src/config.rs

+4
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,9 @@ pub struct Pool {
541541
#[serde(default = "Pool::default_default_role")]
542542
pub default_role: String,
543543

544+
#[serde(default)] // False
545+
pub replica_to_primary_failover_enabled: bool,
546+
544547
#[serde(default)] // False
545548
pub query_parser_enabled: bool,
546549

@@ -734,6 +737,7 @@ impl Default for Pool {
734737
pool_mode: Self::default_pool_mode(),
735738
load_balancing_mode: Self::default_load_balancing_mode(),
736739
default_role: String::from("any"),
740+
replica_to_primary_failover_enabled: false,
737741
query_parser_enabled: false,
738742
query_parser_max_length: None,
739743
query_parser_read_write_splitting: false,

src/pool.rs

+37-14
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,9 @@ pub struct PoolSettings {
162162
// Default server role to connect to.
163163
pub default_role: Option<Role>,
164164

165+
// Whether or not we should use primary when replicas are unavailable
166+
pub replica_to_primary_failover_enabled: bool,
167+
165168
// Enable/disable query parser.
166169
pub query_parser_enabled: bool,
167170

@@ -219,6 +222,7 @@ impl Default for PoolSettings {
219222
user: User::default(),
220223
db: String::default(),
221224
default_role: None,
225+
replica_to_primary_failover_enabled: false,
222226
query_parser_enabled: false,
223227
query_parser_max_length: None,
224228
query_parser_read_write_splitting: false,
@@ -531,6 +535,8 @@ impl ConnectionPool {
531535
"primary" => Some(Role::Primary),
532536
_ => unreachable!(),
533537
},
538+
replica_to_primary_failover_enabled: pool_config
539+
.replica_to_primary_failover_enabled,
534540
query_parser_enabled: pool_config.query_parser_enabled,
535541
query_parser_max_length: pool_config.query_parser_max_length,
536542
query_parser_read_write_splitting: pool_config
@@ -731,6 +737,19 @@ impl ConnectionPool {
731737
});
732738
}
733739

740+
// If the role is replica and we allow sending traffic to primary when replicas are unavailable,
741+
// we insert the primary address at index 0 of the list of candidates (presumably candidates are taken from the back — confirm), this way it will be tried when
742+
// replicas are all unavailable.
743+
if role == Role::Replica && self.settings.replica_to_primary_failover_enabled {
744+
let mut primaries = self
745+
.addresses
746+
.iter()
747+
.flatten()
748+
.filter(|address| address.role == Role::Primary)
749+
.collect::<Vec<&Address>>();
750+
candidates.insert(0, primaries.pop().unwrap());
751+
}
752+
734753
// Indicate we're waiting on a server connection from a pool.
735754
let now = Instant::now();
736755
client_stats.waiting();
@@ -935,24 +954,28 @@ impl ConnectionPool {
935954
return true;
936955
}
937956

938-
// Check if all replicas are banned, in that case unban all of them
939-
let replicas_available = self.addresses[address.shard]
940-
.iter()
941-
.filter(|addr| addr.role == Role::Replica)
942-
.count();
957+
// If we have replica to primary failover we should not unban replicas
958+
// as we still have the primary to serve traffic.
959+
if !self.settings.replica_to_primary_failover_enabled {
960+
// Check if all replicas are banned, in that case unban all of them
961+
let replicas_available = self.addresses[address.shard]
962+
.iter()
963+
.filter(|addr| addr.role == Role::Replica)
964+
.count();
943965

944-
debug!("Available targets: {}", replicas_available);
966+
debug!("Available targets: {}", replicas_available);
945967

946-
let read_guard = self.banlist.read();
947-
let all_replicas_banned = read_guard[address.shard].len() == replicas_available;
948-
drop(read_guard);
968+
let read_guard = self.banlist.read();
969+
let all_replicas_banned = read_guard[address.shard].len() == replicas_available;
970+
drop(read_guard);
949971

950-
if all_replicas_banned {
951-
let mut write_guard = self.banlist.write();
952-
warn!("Unbanning all replicas.");
953-
write_guard[address.shard].clear();
972+
if all_replicas_banned {
973+
let mut write_guard = self.banlist.write();
974+
warn!("Unbanning all replicas.");
975+
write_guard[address.shard].clear();
954976

955-
return true;
977+
return true;
978+
}
956979
}
957980

958981
// Check if ban time is expired

src/query_router.rs

+2
Original file line numberDiff line numberDiff line change
@@ -1459,6 +1459,7 @@ mod test {
14591459
load_balancing_mode: crate::config::LoadBalancingMode::Random,
14601460
shards: 2,
14611461
user: crate::config::User::default(),
1462+
replica_to_primary_failover_enabled: false,
14621463
default_role: Some(Role::Replica),
14631464
query_parser_enabled: true,
14641465
query_parser_max_length: None,
@@ -1538,6 +1539,7 @@ mod test {
15381539
shards: 5,
15391540
user: crate::config::User::default(),
15401541
default_role: Some(Role::Replica),
1542+
replica_to_primary_failover_enabled: false,
15411543
query_parser_enabled: true,
15421544
query_parser_max_length: None,
15431545
query_parser_read_write_splitting: true,

0 commit comments

Comments
 (0)