1 Reply Latest reply on Sep 8, 2014 4:16 AM by jugglingcats

    Nodes unresponsive for short periods of time

    moia

      In our environment we have a cluster of 10 nodes running a dist cluster on 2 machines. We use ISPN 6.0.0.Final.

      Two nodes are exposed to clients and they perform operations on the cluster (let's call them API nodes). Operations are mainly gets (about 200 qps), and some modifications and puts.

      On a regular basis we experience timeouts of all operations on a particular node. Both API nodes report problems with the same node at the same time, and the number of errors suggests that all operations performed on that node time out.

      The first obvious explanation of this problem would be a long gc pause on the unresponsive node, but I examined the gc logs and found no gc running anywhere near the time the problem occurred. In any case, we did gc tuning earlier and shortened gc pauses to well below the replicationTimeout set for the cluster (in this case it's 5 seconds).

      These errors give an average of 1 error per 1000 cache reads, which seems quite a lot.

      The timeout messages look like this:

      2014-02-03 18:27:50 983 : get : 161978979:pilka cause : org.infinispan.util.concurrent.TimeoutException: Timed out waiting for 5 seconds for valid responses from any of [Sender{address=iglass1node2opinions-11619, responded=false}].

      What else, apart from gc, should we look at to find the cause of these problems?

       

      Below I attach the infinispan and jgroups configuration:

      infinispan (this is a regular node setup, API nodes have capacityFactor="0")

      <?xml version="1.0" encoding="UTF-8"?>
      <!--
        Infinispan 6.0 configuration for a regular node.
        Defines one JGroups-backed transport and three named caches
        (entityCache, usersVotesHistoryCache, opinionToEntityCache) whose
        settings are identical apart from the cache name.
      -->
      <infinispan xmlns="urn:infinispan:config:6.0"
                  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                  xsi:schemaLocation="urn:infinispan:config:6.0 http://www.infinispan.org/schemas/infinispan-config-6.0.xsd">

        <!-- Global section: JMX statistics plus the cluster transport. -->
        <global>
          <globalJmxStatistics enabled="true"/>
          <!-- machineId / nodeName are resolved from the ispn.* properties. -->
          <transport clusterName="opinionsDist"
                     machineId="${ispn.machineId}"
                     nodeName="${ispn.nodeId}">
            <properties>
              <!-- JGroups stack definition used by this transport. -->
              <property name="configurationFile"
                        value="pl/com/agora/forum/backend/cache/jgroups-tcp.xml"/>
            </properties>
          </transport>
        </global>

        <!-- Default cache template: only JMX statistics are switched on here. -->
        <default>
          <jmxStatistics enabled="true"/>
        </default>

        <!-- Named cache 1 of 3: dist mode, sync RPCs with replTimeout=5000, L1 enabled. -->
        <namedCache name="entityCache">
          <clustering mode="dist">
            <sync replTimeout="5000"/>
            <hash numOwners="1"
                  numSegments="512"
                  capacityFactor="1.0"
                  factory="org.infinispan.distribution.ch.SyncConsistentHashFactory"/>
            <l1 enabled="true"/>
          </clustering>
          <eviction strategy="LIRS" maxEntries="100000"/>
          <expiration lifespan="-1" maxIdle="-1" reaperEnabled="true" wakeUpInterval="10000"/>
          <locking concurrencyLevel="256" isolationLevel="REPEATABLE_READ" useLockStriping="true"/>
          <transaction transactionMode="NON_TRANSACTIONAL"/>
          <storeAsBinary enabled="false"/>
        </namedCache>

        <!-- Named cache 2 of 3: same settings as entityCache. -->
        <namedCache name="usersVotesHistoryCache">
          <clustering mode="dist">
            <sync replTimeout="5000"/>
            <hash numOwners="1"
                  numSegments="512"
                  capacityFactor="1.0"
                  factory="org.infinispan.distribution.ch.SyncConsistentHashFactory"/>
            <l1 enabled="true"/>
          </clustering>
          <eviction strategy="LIRS" maxEntries="100000"/>
          <expiration lifespan="-1" maxIdle="-1" reaperEnabled="true" wakeUpInterval="10000"/>
          <locking concurrencyLevel="256" isolationLevel="REPEATABLE_READ" useLockStriping="true"/>
          <transaction transactionMode="NON_TRANSACTIONAL"/>
          <storeAsBinary enabled="false"/>
        </namedCache>

        <!-- Named cache 3 of 3: same settings as entityCache. -->
        <namedCache name="opinionToEntityCache">
          <clustering mode="dist">
            <sync replTimeout="5000"/>
            <hash numOwners="1"
                  numSegments="512"
                  capacityFactor="1.0"
                  factory="org.infinispan.distribution.ch.SyncConsistentHashFactory"/>
            <l1 enabled="true"/>
          </clustering>
          <eviction strategy="LIRS" maxEntries="100000"/>
          <expiration lifespan="-1" maxIdle="-1" reaperEnabled="true" wakeUpInterval="10000"/>
          <locking concurrencyLevel="256" isolationLevel="REPEATABLE_READ" useLockStriping="true"/>
          <transaction transactionMode="NON_TRANSACTIONAL"/>
          <storeAsBinary enabled="false"/>
        </namedCache>

      </infinispan>

      jgroups:

      <config xmlns="urn:org:jgroups"
              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
              xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-3.4.xsd">

        <!--
          Transport: TCP.
          Attributes are grouped as socket/bundling settings first, followed by the
          regular, OOB and internal thread pools. All three pools use
          rejection_policy="Discard"; the OOB pool is the only one with its queue
          disabled (oob_thread_pool.queue_enabled="false").
          NOTE(review): with no OOB queue, max_threads="30" and Discard, it may be
          worth checking whether OOB messages are being dropped under load; confirm
          against the transport's thread-pool statistics before changing anything.
        -->
        <TCP bind_addr="${jgroups.tcp.address}"
             bind_port="${jgroups.tcp.port:7800}"
             loopback="true"
             port_range="30"
             recv_buf_size="20m"
             send_buf_size="640k"
             max_bundle_size="31k"
             use_send_queues="true"
             enable_diagnostics="false"
             bundler_type="old"
             thread_naming_pattern="pl"
             thread_pool.enabled="true"
             thread_pool.min_threads="2"
             thread_pool.max_threads="30"
             thread_pool.keep_alive_time="60000"
             thread_pool.queue_enabled="true"
             thread_pool.queue_max_size="100"
             thread_pool.rejection_policy="Discard"
             oob_thread_pool.enabled="true"
             oob_thread_pool.min_threads="2"
             oob_thread_pool.max_threads="30"
             oob_thread_pool.keep_alive_time="60000"
             oob_thread_pool.queue_enabled="false"
             oob_thread_pool.queue_max_size="100"
             oob_thread_pool.rejection_policy="Discard"
             internal_thread_pool.enabled="true"
             internal_thread_pool.min_threads="1"
             internal_thread_pool.max_threads="10"
             internal_thread_pool.keep_alive_time="60000"
             internal_thread_pool.queue_enabled="true"
             internal_thread_pool.queue_max_size="100"
             internal_thread_pool.rejection_policy="Discard"/>

        <!-- Ergonomics, new in JGroups 2.11, are disabled by default in TCPPING until JGRP-1253 is resolved -->
        <!-- Discovery in this stack is FILE_PING, followed by merge handling. -->
        <FILE_PING timeout="3000"
                   num_initial_members="1"
                   location="${jgroups.fileping.location:/exp/forum-backend/infinispan/infinispan-60-dist}"/>
        <MERGE2 max_interval="30000" min_interval="10000"/>

        <!-- Failure detection and suspicion verification. -->
        <FD_SOCK/>
        <FD timeout="3000" max_tries="5"/>
        <VERIFY_SUSPECT timeout="1500"/>

        <!-- Retransmission protocols: NAKACK2 and UNICAST3. -->
        <pbcast.NAKACK2 use_mcast_xmit="false"
                        xmit_interval="1000"
                        xmit_table_num_rows="100"
                        xmit_table_msgs_per_row="10000"
                        xmit_table_max_compaction_time="10000"
                        max_msg_batch_size="100"/>
        <UNICAST3 xmit_interval="500"
                  xmit_table_num_rows="20"
                  xmit_table_msgs_per_row="10000"
                  xmit_table_max_compaction_time="10000"
                  max_msg_batch_size="100"
                  conn_expiry_timeout="0"
                  conn_close_timeout="3000"/>

        <!-- Stability, group membership and total-order protocols. -->
        <pbcast.STABLE stability_delay="500" desired_avg_gossip="5000" max_bytes="1m"/>
        <pbcast.GMS print_local_addr="false" join_timeout="3000" view_bundling="true"/>
        <tom.TOA/> <!-- the TOA is only needed for total order transactions -->

        <!-- Flow control (UFC/MFC), fragmentation and RSVP. -->
        <UFC max_credits="2m" min_threshold="0.40"/>
        <MFC max_credits="2m" min_threshold="0.40"/>
        <FRAG2 frag_size="30k"/>
        <RSVP timeout="60000" resend_interval="500" ack_on_delivery="false"/>

      </config>

       

      Best regards,

      Mikolaj