Unresponsive cluster node after weeks of running with REPLICATION_ASYNC
jugglingcats Oct 23, 2014 4:57 PMWe we are running 6.0.0-Final in a three node REPLICATION_ASYNC cluster. Our cluster does a steady 80 cache reads and 60 cache puts a second.
After a week or two of running sometimes the cluster starts to fail. Normally one of the nodes in the cluster becomes unresponsive. Today I got a thread dump of the unresponsive node and am looking at it using http://mchr3k.github.io/javathreaddumpanalyser/.
What I can see is a large number of threads waiting on the invalidation message being sent to the other nodes in the cluster. Invalidation is async, so I assume this means the outgoing queue is full and therefore blocking.
Total threads: 838
423 threads with trace:
States: {WAITING=423}
Stack:
- sun.misc.Unsafe.park(Native Method)
- - parking to wait for <IGNORED> (a java.util.concurrent.locks.ReentrantLock$NonfairSync)
- java.util.concurrent.locks.LockSupport.park(LockSupport.java:156)
- java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:811)
- java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:842)
- java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1178)
- java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:186)
- java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:262)
- org.jgroups.protocols.TP$DefaultBundler.send(TP.java:2155)
- org.jgroups.protocols.TP.send(TP.java:1641)
- org.jgroups.protocols.TP.down(TP.java:1336)
- org.jgroups.protocols.Discovery.down(Discovery.java:597)
- com.xxxxxx.ad.cache.infinispan.MongoPing.down(MongoPing.java:91)
- org.jgroups.protocols.MERGE2.down(MERGE2.java:185)
- org.jgroups.protocols.FD_SOCK.down(FD_SOCK.java:363)
- org.jgroups.protocols.FD.down(FD.java:307)
- org.jgroups.protocols.VERIFY_SUSPECT.down(VERIFY_SUSPECT.java:84)
- org.jgroups.protocols.pbcast.NAKACK2.send(NAKACK2.java:731)
- org.jgroups.protocols.pbcast.NAKACK2.down(NAKACK2.java:461)
- org.jgroups.protocols.UNICAST3.down(UNICAST3.java:575)
- org.jgroups.protocols.pbcast.STABLE.down(STABLE.java:347)
- org.jgroups.protocols.pbcast.GMS.down(GMS.java:1075)
- org.jgroups.protocols.tom.TOA.handleDownMessage(TOA.java:189)
- org.jgroups.protocols.tom.TOA.down(TOA.java:64)
- org.jgroups.protocols.FlowControl.down(FlowControl.java:340)
- org.jgroups.protocols.MFC.handleDownMessage(MFC.java:114)
- org.jgroups.protocols.FlowControl.down(FlowControl.java:330)
- org.jgroups.protocols.FRAG2.down(FRAG2.java:136)
- org.jgroups.protocols.RSVP.down(RSVP.java:142)
- org.jgroups.stack.ProtocolStack.down(ProtocolStack.java:1024)
- org.jgroups.JChannel.down(JChannel.java:760)
- org.jgroups.blocks.MessageDispatcher$ProtocolAdapter.down(MessageDispatcher.java:683)
- org.jgroups.blocks.RequestCorrelator.sendRequest(RequestCorrelator.java:172)
- org.jgroups.blocks.GroupRequest.sendRequest(GroupRequest.java:360)
- org.jgroups.blocks.GroupRequest.sendRequest(GroupRequest.java:103)
- org.jgroups.blocks.Request.execute(Request.java:83)
- org.jgroups.blocks.MessageDispatcher.cast(MessageDispatcher.java:359)
- org.jgroups.blocks.MessageDispatcher.cast(MessageDispatcher.java:365)
- org.jgroups.blocks.MessageDispatcher.castMessage(MessageDispatcher.java:255)
- org.infinispan.remoting.transport.jgroups.CommandAwareRpcDispatcher.processCalls(CommandAwareRpcDispatcher.java:399)
- org.infinispan.remoting.transport.jgroups.CommandAwareRpcDispatcher.invokeRemoteCommands(CommandAwareRpcDispatcher.java:132)
- org.infinispan.remoting.transport.jgroups.CommandAwareRpcDispatcher.broadcastRemoteCommands(CommandAwareRpcDispatcher.java:187)
- org.infinispan.remoting.transport.jgroups.JGroupsTransport.invokeRemotely(JGroupsTransport.java:500)
- org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:281)
- org.infinispan.interceptors.InvalidationInterceptor.invalidateAcrossCluster(InvalidationInterceptor.java:227)
- org.infinispan.interceptors.InvalidationInterceptor.handleInvalidate(InvalidationInterceptor.java:143)
- org.infinispan.interceptors.InvalidationInterceptor.visitPutKeyValueCommand(InvalidationInterceptor.java:80)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.CacheWriterInterceptor.visitPutKeyValueCommand(CacheWriterInterceptor.java:161)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.CacheLoaderInterceptor.visitPutKeyValueCommand(CacheLoaderInterceptor.java:96)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.EntryWrappingInterceptor.invokeNextAndApplyChanges(EntryWrappingInterceptor.java:321)
- org.infinispan.interceptors.EntryWrappingInterceptor.setSkipRemoteGetsAndInvokeNextForDataCommand(EntryWrappingInterceptor.java:401)
- org.infinispan.interceptors.EntryWrappingInterceptor.visitPutKeyValueCommand(EntryWrappingInterceptor.java:163)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.locking.AbstractLockingInterceptor.visitPutKeyValueCommand(AbstractLockingInterceptor.java:45)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.base.CommandInterceptor.handleDefault(CommandInterceptor.java:112)
- org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:32)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.CacheMgmtInterceptor.updateStoreStatistics(CacheMgmtInterceptor.java:148)
- org.infinispan.interceptors.CacheMgmtInterceptor.visitPutKeyValueCommand(CacheMgmtInterceptor.java:134)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:98)
- org.infinispan.interceptors.InvocationContextInterceptor.handleAll(InvocationContextInterceptor.java:106)
- org.infinispan.interceptors.InvocationContextInterceptor.handleDefault(InvocationContextInterceptor.java:70)
- org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:32)
- org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:62)
- org.infinispan.interceptors.InterceptorChain.invoke(InterceptorChain.java:321)
- org.infinispan.CacheImpl.executeCommandAndCommitIfNeeded(CacheImpl.java:1296)
- org.infinispan.CacheImpl.putInternal(CacheImpl.java:869)
- org.infinispan.CacheImpl.put(CacheImpl.java:861)
- org.infinispan.DecoratedCache.put(DecoratedCache.java:401)
- com.xxxxxx.ad.mixin.core.DefaultBuilder.store(DefaultBuilder.java:132)
- com.xxxxxx.ad.mixin.core.DefaultBuilder.passivate(DefaultBuilder.java:142)
- com.xxxxxx.ad.session.SessionSource.passivate(SessionSource.java:136)
- com.xxxxxx.ad.web.support.SessionRequestExecutor.with(SessionRequestExecutor.java:116)
- com.xxxxxx.ad.web.AuthorisationResource.authorizeEsi(AuthorisationResource.java:112)
- sun.reflect.GeneratedMethodAccessor59.invoke(Unknown Source)
- sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
- java.lang.reflect.Method.invoke(Method.java:597)
- com.sun.jersey.spi.container.JavaMethodInvokerFactory$1.invoke(JavaMethodInvokerFactory.java:60)
- com.sun.jersey.server.impl.model.method.dispatch.AbstractResourceMethodDispatchProvider$ResponseOutInvoker._dispatch(AbstractResourceMethodDispatchProvider.java:205)
- com.sun.jersey.server.impl.model.method.dispatch.ResourceJavaMethodDispatcher.dispatch(ResourceJavaMethodDispatcher.java:75)
- com.codahale.metrics.jersey.InstrumentedResourceMethodDispatchProvider$ExceptionMeteredRequestDispatcher.dispatch(InstrumentedResourceMethodDispatchProvider.java:69)
- com.sun.jersey.server.impl.uri.rules.HttpMethodRule.accept(HttpMethodRule.java:288)
- com.sun.jersey.server.impl.uri.rules.RightHandPathRule.accept(RightHandPathRule.java:147)
- com.sun.jersey.server.impl.uri.rules.ResourceClassRule.accept(ResourceClassRule.java:108)
- com.sun.jersey.server.impl.uri.rules.RightHandPathRule.accept(RightHandPathRule.java:147)
- com.sun.jersey.server.impl.uri.rules.RootResourceClassesRule.accept(RootResourceClassesRule.java:84)
- com.sun.jersey.server.impl.application.WebApplicationImpl._handleRequest(WebApplicationImpl.java:1469)
- com.sun.jersey.server.impl.application.WebApplicationImpl._handleRequest(WebApplicationImpl.java:1400)
- com.sun.jersey.server.impl.application.WebApplicationImpl.handleRequest(WebApplicationImpl.java:1349)
- com.sun.jersey.server.impl.application.WebApplicationImpl.handleRequest(WebApplicationImpl.java:1339)
- com.sun.jersey.spi.container.servlet.WebComponent.service(WebComponent.java:416)
- com.sun.jersey.spi.container.servlet.ServletContainer.service(ServletContainer.java:537)
- com.sun.jersey.spi.container.servlet.ServletContainer.service(ServletContainer.java:699)
- javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
- org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
- org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
- org.springframework.web.filter.CompositeFilter$VirtualFilterChain.doFilter(CompositeFilter.java:98)
- com.xxxxxx.ad.web.security.ic.IdentityCoreSessionValidationFilter.doFilter(IdentityCoreSessionValidationFilter.java:84)
- org.springframework.web.filter.CompositeFilter$VirtualFilterChain.doFilter(CompositeFilter.java:102)
- org.springframework.web.filter.CompositeFilter.doFilter(CompositeFilter.java:82)
- org.springframework.web.filter.DelegatingFilterProxy.invokeDelegate(DelegatingFilterProxy.java:343)
- org.springframework.web.filter.DelegatingFilterProxy.doFilter(DelegatingFilterProxy.java:260)
- org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
- org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
- org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
- org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
- org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
- org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
- org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
- org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:293)
- org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:859)
- org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:602)
- org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
- java.lang.Thread.run(Thread.java:662)
There are another 106 + 7 threads with very similar call stacks. So in total 536 threads are waiting for the invalidation message to be sent.
Our jgroups config is below. My questions are:
- Which thread pool requires tuning for the outbound invalidation message? I assume it's either thread_pool.queue_max_size or internal_thread_pool.queue_max_size
- Does rejection_policy=Discard imply that when the queue is full the message will be dropped?
- What other values for rejection_policy are available?
Any other suggestions for troubleshooting the issue much appreciated. Thanks.
jgroups config...
<config xmlns="urn:org:jgroups"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-3.4.xsd">
<TCP
bind_addr="${jgroups.tcp.address:127.0.0.1}"
bind_port="${jgroups.tcp.port:7800}"
loopback="true"
port_range="${jgroups.tcp.port_range:10}"
recv_buf_size="20000000"
send_buf_size="640000"
max_bundle_size="64k"
use_send_queues="true"
sock_conn_timeout="${jgroups.tcp.sock_conn_timeout:300}"
enable_diagnostics="false"
bundler_type="${jgroups.tcp.bundler_type:old}"
thread_pool.enabled="true"
thread_pool.min_threads="2"
thread_pool.max_threads="${jgroups.thread_pool.max_threads:100}"
thread_pool.keep_alive_time="60000"
thread_pool.queue_enabled="${jgroups.thread_pool.queue_enabled:false}"
thread_pool.queue_max_size="100"
thread_pool.rejection_policy="Discard"
oob_thread_pool.enabled="true"
oob_thread_pool.min_threads="2"
oob_thread_pool.max_threads="${jgroups.oob_thread_pool.max_threads:100}"
oob_thread_pool.keep_alive_time="60000"
oob_thread_pool.queue_enabled="false"
oob_thread_pool.queue_max_size="100"
oob_thread_pool.rejection_policy="Discard"
internal_thread_pool.enabled="true"
internal_thread_pool.min_threads="1"
internal_thread_pool.max_threads="${jgroups.tcp.internal_thread_pool.max_threads:100}"
internal_thread_pool.keep_alive_time="60000"
internal_thread_pool.queue_enabled="true"
internal_thread_pool.queue_max_size="100"
internal_thread_pool.rejection_policy="Discard"
/>
<com.xxxxxx.ad.cache.infinispan.MongoPing/>
<MERGE2 max_interval="30000"
min_interval="10000"/>
<FD_SOCK/>
<FD timeout="3000" max_tries="5"/>
<VERIFY_SUSPECT timeout="1500"/>
<pbcast.NAKACK2
use_mcast_xmit="false"
xmit_interval="1000"
xmit_table_num_rows="100" xmit_table_msgs_per_row="10000"
xmit_table_max_compaction_time="10000"
max_msg_batch_size="100"/>
<UNICAST3
xmit_table_num_rows="20"
xmit_table_msgs_per_row="10000"
xmit_table_max_compaction_time="10000"
max_msg_batch_size="100"
conn_expiry_timeout="0"/>
<pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000"
max_bytes="400000"/>
<pbcast.GMS print_local_addr="false" join_timeout="7000" view_bundling="true"/>
<tom.TOA/> <!-- the TOA is only needed for total order transactions-->
<UFC max_credits="2m" min_threshold="0.40"/>
<MFC max_credits="2m" min_threshold="0.40"/>
<FRAG2 frag_size="30k" />
<RSVP timeout="60000" resend_interval="500" ack_on_delivery="false"/>
</config>