1 Reply Latest reply on Mar 28, 2008 7:43 AM by mircea.markus

2 node in cluster being stuck when GC runs due to JBC?

jorgemoralespou_2 Mar 28, 2008 7:24 AM

Hi,
We have a 2 node cluster. The only clustering is done through JBC, with configurations like the one below. We are seeing that when one node has a Full GC with a 10-second "Stop the world" pause, requests on the other node get delayed by 10 seconds. Since the cache is the only thing shared between these 2 nodes, this has to be the cause.
Has anyone seen behaviour like this before? Is it normal? How can I avoid it?

<mbean code="com.mycompany.som.cache.core.mbean.CoreCacheWithListener"
 name="jboss.cache:service=SOMCoreCache">
 <!-- Cache MBean registered as jboss.cache:service=SOMCoreCache and joined to
 the cluster named SOMCoreCache-Cluster (see ClusterName below). -->

 <!-- Services this MBean declares a dependency on. -->
 <depends>jboss:service=Naming</depends>
 <depends>jboss:service=TransactionManager</depends>
 <depends>jboss.jca:name=jdbc/som,service=DataSourceBinding</depends>

 <depends>jboss:service=DeploymentService</depends>

 <attribute name="TransactionManagerLookupClass">
 org.jboss.cache.JBossTransactionManagerLookup
 </attribute>

 <!-- Replication is synchronous (REPL_SYNC) and the replication queue is
 switched off (UseReplQueue=false), so the two ReplQueue* values below are
 presumably unused; confirm against the JBoss Cache docs for the version
 in use. -->
 <attribute name="IsolationLevel">NONE</attribute>
 <attribute name="CacheMode">REPL_SYNC</attribute>
 <attribute name="UseReplQueue">false</attribute>
 <attribute name="ReplQueueInterval">0</attribute>
 <attribute name="ReplQueueMaxElements">0</attribute>
 <attribute name="ClusterName">SOMCoreCache-Cluster</attribute>

 <!-- Protocol stack used for cluster communication. The transport is UDP
 multicast; the address and port use ${property:default} placeholders, so
 they resolve to 228.1.2.3 and 48866 unless jboss.cache.SOMCoreCache.addr /
 jboss.cache.SOMCoreCache.port are set. -->
 <attribute name="ClusterConfig">
 <config>
 <!-- UDP: if you have a multihomed machine,
 set the bind_addr attribute to the appropriate NIC IP address, e.g bind_addr="192.168.0.2"
 -->
 <!-- UDP: On Windows machines, because of the media sense feature
 being broken with multicast (even after disabling media sense)
 set the loopback attribute to true -->
 <UDP mcast_addr="${jboss.cache.SOMCoreCache.addr:228.1.2.3}"
 mcast_port="${jboss.cache.SOMCoreCache.port:48866}" ip_ttl="64"
 ip_mcast="true" mcast_send_buf_size="150000"
 mcast_recv_buf_size="80000" ucast_send_buf_size="150000"
 ucast_recv_buf_size="80000" loopback="false" />
 <PING timeout="2000" num_initial_members="3" up_thread="false"
 down_thread="false" />
 <MERGE2 min_interval="10000" max_interval="20000" />
 <!-- NOTE(review): FD is commented out below, leaving FD_SOCK as the only
 failure-detection protocol enabled in this stack. -->
 <!-- <FD shun="true" up_thread="true" down_thread="true" />-->
 <FD_SOCK />
 <VERIFY_SUSPECT timeout="1500" up_thread="false"
 down_thread="false" />
 <pbcast.NAKACK gc_lag="50" retransmit_timeout="600,1200,2400,4800"
 max_xmit_size="8192" up_thread="false" down_thread="false" />
 <UNICAST timeout="600,1200,2400" down_thread="false" />
 <pbcast.STABLE desired_avg_gossip="20000" up_thread="false"
 down_thread="false" />
 <FRAG frag_size="8192" down_thread="false" up_thread="false" />
 <pbcast.GMS join_timeout="5000" join_retry_timeout="2000"
 shun="true" print_local_addr="true" />
 <pbcast.STATE_TRANSFER up_thread="true" down_thread="true" />
 </config>
 </attribute>

 <!-- Timeouts. SyncReplTimeout bounds how long a synchronous replication call
 waits for the other node's acknowledgment; 15000 here is 15 seconds. The
 other timeout values are presumably in milliseconds as well; TODO confirm. -->
 <attribute name="FetchInMemoryState">true</attribute>
 <attribute name="InitialStateRetrievalTimeout">15000</attribute>
 <attribute name="SyncReplTimeout">15000</attribute>
 <attribute name="LockAcquisitionTimeout">10000</attribute>
 <attribute name="UseRegionBasedMarshalling">true</attribute>
 <attribute name="InactiveOnStartup">false</attribute>

 <!-- Single JDBC cache loader on the java:jdbc/som datasource, storing nodes
 in table core_data. It is declared shared between nodes (shared=true),
 written asynchronously (async=true), preloads from "/" and does not use
 passivation. -->
 <attribute name="CacheLoaderConfiguration">
 <config>
 <passivation>false</passivation>
 <preload>/</preload>
 <shared>true</shared>
 <cacheloader>
 <class>org.jboss.cache.loader.JDBCCacheLoader</class>
 <properties>
 cache.jdbc.datasource=java:jdbc/som
 cache.jdbc.table.name=core_data
 cache.jdbc.table.create=true
 cache.jdbc.table.drop=false
 cache.jdbc.table.primarykey=jbosscache_pk
 cache.jdbc.fqn.column=fqn
 cache.jdbc.fqn.type=varchar(255)
 cache.jdbc.node.column=node
 cache.jdbc.node.type=blob
 cache.jdbc.parent.column=parent
 </properties>
 <async>true</async>
 <fetchPersistentState>true</fetchPersistentState>
 <ignoreModifications>false</ignoreModifications>
 </cacheloader>
 </config>
 </attribute>

 </mbean>
</server>

1. Re: 2 node in cluster being stuck when GC runs due to JBC?

mircea.markus Mar 28, 2008 7:43 AM (in response to jorgemoralespou_2)

As the cache is in SYNC mode I would say that this is 'normal' behavior. The non-GC cache waits for acknowledgment from the frozen cache until it receives it or times out (SyncReplTimeout is set to 15 secs in your case).
If you cannot tune the GC to avoid "Stop the world" scenarios, here are some suggestions from a cache perspective:
1) use ASYNC replication if business allows it. This way the non-GC cache won't wait for the frozen cache to ACK
2) reduce the SyncReplTimeout. If you do, the non-GC cache will throw an exception on the operation sooner and give quicker feedback to the user.
Actions