Skip to content

Commit 812de35

Browse files
dpkpjeffwidman
authored andcommitted
Retry bootstrapping after backoff when necessary (#1736)
The current client attempts to bootstrap once during initialization, but if it fails there is no second attempt and the client will be inoperable. This can happen, for example, if an entire cluster is down at the time a long-running client starts execution. This commit attempts to fix this by removing the synchronous bootstrapping from `KafkaClient` init, and instead merges bootstrap metadata with the cluster metadata. The Java client uses a similar approach. This allows us to continue falling back to bootstrap data when necessary throughout the life of a long-running consumer or producer. Fix #1670
1 parent 703f065 commit 812de35

File tree

3 files changed

+112
-129
lines changed

3 files changed

+112
-129
lines changed

kafka/client_async.py

+66-83
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class KafkaClient(object):
5555
5656
Keyword Arguments:
5757
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
58-
strings) that the consumer should contact to bootstrap initial
58+
strings) that the client should contact to bootstrap initial
5959
cluster metadata. This does not have to be the full node list.
6060
It just needs to have at least one broker that will respond to a
6161
Metadata API Request. Default port is 9092. If no servers are
@@ -222,76 +222,34 @@ def __init__(self, **configs):
222222
self.config['metric_group_prefix'],
223223
weakref.proxy(self._conns))
224224

225-
self._bootstrap(collect_hosts(self.config['bootstrap_servers']))
225+
self._num_bootstrap_hosts = len(collect_hosts(self.config['bootstrap_servers']))
226226

227227
# Check Broker Version if not set explicitly
228228
if self.config['api_version'] is None:
229229
check_timeout = self.config['api_version_auto_timeout_ms'] / 1000
230230
self.config['api_version'] = self.check_version(timeout=check_timeout)
231231

232-
def _bootstrap(self, hosts):
233-
log.info('Bootstrapping cluster metadata from %s', hosts)
234-
# Exponential backoff if bootstrap fails
235-
backoff_ms = self.config['reconnect_backoff_ms'] * 2 ** self._bootstrap_fails
232+
def _can_bootstrap(self):
233+
effective_failures = self._bootstrap_fails // self._num_bootstrap_hosts
234+
backoff_factor = 2 ** effective_failures
235+
backoff_ms = min(self.config['reconnect_backoff_ms'] * backoff_factor,
236+
self.config['reconnect_backoff_max_ms'])
237+
238+
backoff_ms *= random.uniform(0.8, 1.2)
239+
236240
next_at = self._last_bootstrap + backoff_ms / 1000.0
237-
self._refresh_on_disconnects = False
238241
now = time.time()
239242
if next_at > now:
240-
log.debug("Sleeping %0.4f before bootstrapping again", next_at - now)
241-
time.sleep(next_at - now)
242-
self._last_bootstrap = time.time()
243-
244-
if self.config['api_version'] is None or self.config['api_version'] < (0, 10):
245-
if self.config['bootstrap_topics_filter']:
246-
metadata_request = MetadataRequest[0](list(self.config['bootstrap_topics_filter']))
247-
else:
248-
metadata_request = MetadataRequest[0]([])
249-
else:
250-
if self.config['bootstrap_topics_filter']:
251-
metadata_request = MetadataRequest[1](list(self.config['bootstrap_topics_filter']))
252-
else:
253-
metadata_request = MetadataRequest[1](None)
254-
255-
for host, port, afi in hosts:
256-
log.debug("Attempting to bootstrap via node at %s:%s", host, port)
257-
cb = functools.partial(WeakMethod(self._conn_state_change), 'bootstrap')
258-
bootstrap = BrokerConnection(host, port, afi,
259-
state_change_callback=cb,
260-
node_id='bootstrap',
261-
**self.config)
262-
if not bootstrap.connect_blocking():
263-
bootstrap.close()
264-
continue
265-
future = bootstrap.send(metadata_request)
266-
while not future.is_done:
267-
self._selector.select(1)
268-
for r, f in bootstrap.recv():
269-
f.success(r)
270-
if future.failed():
271-
bootstrap.close()
272-
continue
273-
self.cluster.update_metadata(future.value)
274-
log.info('Bootstrap succeeded: found %d brokers and %d topics.',
275-
len(self.cluster.brokers()), len(self.cluster.topics()))
276-
277-
# A cluster with no topics can return no broker metadata
278-
# in that case, we should keep the bootstrap connection
279-
if not len(self.cluster.brokers()):
280-
self._conns['bootstrap'] = bootstrap
281-
else:
282-
bootstrap.close()
283-
self._bootstrap_fails = 0
284-
break
285-
# No bootstrap found...
286-
else:
287-
log.error('Unable to bootstrap from %s', hosts)
288-
# Max exponential backoff is 2^12, x4000 (50ms -> 200s)
289-
self._bootstrap_fails = min(self._bootstrap_fails + 1, 12)
290-
self._refresh_on_disconnects = True
243+
return False
244+
return True
291245

292246
def _can_connect(self, node_id):
293247
if node_id not in self._conns:
294-
if self.cluster.broker_metadata(node_id):
248+
# cluster.broker_metadata() is stateful when called w/ 'bootstrap'
249+
# (it cycles through all of the bootstrap servers)
250+
# so we short-circuit here and assume that we should always have
251+
# some bootstrap_servers config to power bootstrap broker_metadata
252+
if node_id == 'bootstrap' or self.cluster.broker_metadata(node_id):
295253
return True
296254
return False
297255
conn = self._conns[node_id]
@@ -308,6 +266,9 @@ def _conn_state_change(self, node_id, conn):
308266
except KeyError:
309267
self._selector.modify(conn._sock, selectors.EVENT_WRITE)
310268

269+
if node_id == 'bootstrap':
270+
self._last_bootstrap = time.time()
271+
311272
elif conn.connected():
312273
log.debug("Node %s connected", node_id)
313274
if node_id in self._connecting:
@@ -323,12 +284,12 @@ def _conn_state_change(self, node_id, conn):
323284

324285
self._idle_expiry_manager.update(node_id)
325286

326-
if 'bootstrap' in self._conns and node_id != 'bootstrap':
287+
if node_id == 'bootstrap':
288+
self._bootstrap_fails = 0
289+
290+
elif 'bootstrap' in self._conns:
327291
bootstrap = self._conns.pop('bootstrap')
328-
# XXX: make conn.close() require error to cause refresh
329-
self._refresh_on_disconnects = False
330292
bootstrap.close()
331-
self._refresh_on_disconnects = True
332293

333294
# Connection failures imply that our metadata is stale, so let's refresh
334295
elif conn.state is ConnectionStates.DISCONNECTING:
@@ -347,7 +308,10 @@ def _conn_state_change(self, node_id, conn):
347308
idle_disconnect = True
348309
self._idle_expiry_manager.remove(node_id)
349310

350-
if self._refresh_on_disconnects and not self._closed and not idle_disconnect:
311+
if node_id == 'bootstrap':
312+
self._bootstrap_fails += 1
313+
314+
elif self._refresh_on_disconnects and not self._closed and not idle_disconnect:
351315
log.warning("Node %s connection failed -- refreshing metadata", node_id)
352316
self.cluster.request_update()
353317

@@ -362,13 +326,40 @@ def maybe_connect(self, node_id):
362326
return True
363327
return False
364328

329+
def _should_recycle_connection(self, conn):
330+
# Never recycle unless disconnected
331+
if not conn.disconnected():
332+
return False
333+
334+
# Always recycled disconnected bootstraps
335+
elif conn.node_id == 'bootstrap':
336+
return True
337+
338+
# Otherwise, only recycle when broker metadata has changed
339+
broker = self.cluster.broker_metadata(conn.node_id)
340+
if broker is None:
341+
return False
342+
343+
host, _, afi = get_ip_port_afi(broker.host)
344+
if conn.host != host or conn.port != broker.port:
345+
log.info("Broker metadata change detected for node %s"
346+
" from %s:%s to %s:%s", conn.node_id, conn.host, conn.port,
347+
broker.host, broker.port)
348+
return True
349+
350+
return False
351+
365352
def _maybe_connect(self, node_id):
366353
"""Idempotent non-blocking connection attempt to the given node id."""
367354
with self._lock:
368-
broker = self.cluster.broker_metadata(node_id)
369355
conn = self._conns.get(node_id)
370356

371357
if conn is None:
358+
# Note that when bootstrapping, each call to broker_metadata may
359+
# return a different host/port. So we need to be careful to only
360+
# call when necessary to avoid skipping some possible bootstrap
361+
# source.
362+
broker = self.cluster.broker_metadata(node_id)
372363
assert broker, 'Broker id %s not in current metadata' % (node_id,)
373364

374365
log.debug("Initiating connection to node %s at %s:%s",
@@ -382,17 +373,9 @@ def _maybe_connect(self, node_id):
382373
self._conns[node_id] = conn
383374

384375
# Check if existing connection should be recreated because host/port changed
385-
elif conn.disconnected() and broker is not None:
386-
host, _, __ = get_ip_port_afi(broker.host)
387-
if conn.host != host or conn.port != broker.port:
388-
log.info("Broker metadata change detected for node %s"
389-
" from %s:%s to %s:%s", node_id, conn.host, conn.port,
390-
broker.host, broker.port)
391-
392-
# Drop old connection object.
393-
# It will be recreated on next _maybe_connect
394-
self._conns.pop(node_id)
395-
return False
376+
elif self._should_recycle_connection(conn):
377+
self._conns.pop(node_id)
378+
return False
396379

397380
elif conn.connected():
398381
return True
@@ -713,7 +696,8 @@ def least_loaded_node(self):
713696
This method will prefer a node with an existing connection and no
714697
in-flight-requests. If no such node is found, a node will be chosen
715698
randomly from disconnected nodes that are not "blacked out" (i.e.,
716-
are not subject to a reconnect backoff).
699+
are not subject to a reconnect backoff). If no node metadata has been
700+
obtained, will return 'bootstrap' (subject to exponential backoff).
717701
718702
Returns:
719703
node_id or None if no suitable node was found
@@ -740,12 +724,8 @@ def least_loaded_node(self):
740724
if found is not None:
741725
return found
742726

743-
# some broker versions return an empty list of broker metadata
744-
# if there are no topics created yet. the bootstrap process
745-
# should detect this and keep a 'bootstrap' node alive until
746-
# a non-bootstrap node is connected and non-empty broker
747-
# metadata is available
748-
elif 'bootstrap' in self._conns:
727+
elif not nodes and self._can_bootstrap():
728+
self._last_bootstrap = time.time()
749729
return 'bootstrap'
750730

751731
return None
@@ -805,6 +785,9 @@ def _maybe_refresh_metadata(self):
805785

806786
if self._can_send_request(node_id):
807787
topics = list(self._topics)
788+
if not topics and node_id == 'bootstrap':
789+
topics = list(self.config['bootstrap_topics_filter'])
790+
808791
if self.cluster.need_all_topic_metadata or not topics:
809792
topics = [] if self.config['api_version'] < (0, 10) else None
810793
api_version = 0 if self.config['api_version'] < (0, 10) else 1

kafka/cluster.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from kafka.vendor import six
1010

1111
from kafka import errors as Errors
12+
from kafka.conn import collect_hosts, dns_lookup
1213
from kafka.future import Future
1314
from kafka.structs import BrokerMetadata, PartitionMetadata, TopicPartition
1415

@@ -29,10 +30,17 @@ class ClusterMetadata(object):
2930
which we force a refresh of metadata even if we haven't seen any
3031
partition leadership changes to proactively discover any new
3132
brokers or partitions. Default: 300000
33+
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
34+
strings) that the client should contact to bootstrap initial
35+
cluster metadata. This does not have to be the full node list.
36+
It just needs to have at least one broker that will respond to a
37+
Metadata API Request. Default port is 9092. If no servers are
38+
specified, will default to localhost:9092.
3239
"""
3340
DEFAULT_CONFIG = {
3441
'retry_backoff_ms': 100,
3542
'metadata_max_age_ms': 300000,
43+
'bootstrap_servers': 'localhost',
3644
}
3745

3846
def __init__(self, **configs):
@@ -42,7 +50,7 @@ def __init__(self, **configs):
4250
self._groups = {} # group_name -> node_id
4351
self._last_refresh_ms = 0
4452
self._last_successful_refresh_ms = 0
45-
self._need_update = False
53+
self._need_update = True
4654
self._future = None
4755
self._listeners = set()
4856
self._lock = threading.Lock()
@@ -56,6 +64,17 @@ def __init__(self, **configs):
5664
if key in configs:
5765
self.config[key] = configs[key]
5866

67+
self._bootstrap_brokers = self._generate_bootstrap_brokers()
68+
69+
def _generate_bootstrap_brokers(self):
70+
# collect_hosts does not perform DNS, so we should be fine to re-use
71+
bootstrap_hosts = collect_hosts(self.config['bootstrap_servers'])
72+
73+
while True:
74+
for host, port, afi in bootstrap_hosts:
75+
for _, __, ___, ____, sockaddr in dns_lookup(host, port, afi):
76+
yield BrokerMetadata('bootstrap', sockaddr[0], sockaddr[1], None)
77+
5978
def brokers(self):
6079
"""Get all BrokerMetadata
6180
@@ -73,6 +92,9 @@ def broker_metadata(self, broker_id):
7392
Returns:
7493
BrokerMetadata or None if not found
7594
"""
95+
if broker_id == 'bootstrap':
96+
return next(self._bootstrap_brokers)
97+
7698
return self._brokers.get(broker_id)
7799

78100
def partitions_for_topic(self, topic):

0 commit comments

Comments
 (0)