11 Replies Latest reply on Dec 17, 2008 6:44 AM by nachiket_patel

    What's wrong with this configuration??

    nachiket_patel

      Hello guys,
      After struggling for 6 to 8 hours, without any clue,
      I am posting my configuration here.

      --: My problem :--
      After AppInstance1 is disconnected, data updated on "PojoCache" by AppInstance2 is not reflected (changed) in AppInstance1 after reconnection.

      But if I configure it using UDP, it synchronizes data between AppInstance1 and AppInstance2 after reconnection.

      I am running GossipRouter using command line :
      java -cp jgroups.jar;commons-logging.jar org.jgroups.stack.GossipRouter -port 5555 -bindaddress 150.0.149.105

      -------------------------------------------------------------------




      jboss:service=Naming
      jboss:service=TransactionManager
      org.jboss.cache.transaction.GenericTransactionManagerLookup
      REPEATABLE_READ
      REPL_ASYNC
      false
      0
      0
      JBossCache-Cluster


      <TCP start_port="7800"
      loopback = "true"
      discard_incompatible_packets="true"
      max_bundle_size="64000"
      max_bundle_timeout="30"
      use_incoming_packet_handler="true"
      enable_bundling="false"
      enable_diagnostics="true"
      use_concurrent_stack="true"
      thread_naming_pattern="pl"

      thread_pool.enabled="true"
      thread_pool.min_threads="2"
      thread_pool.max_threads="8"
      thread_pool.keep_alive_time="30000"
      thread_pool.queue_enabled="true"
      thread_pool.queue_max_size="10000"
      thread_pool.rejection_policy="run"

      oob_thread_pool.enabled="true"
      oob_thread_pool.min_threads="1"
      oob_thread_pool.max_threads="8"
      oob_thread_pool.keep_alive_time="5000"
      oob_thread_pool.queue_enabled="false"
      oob_thread_pool.queue_max_size="100"
      oob_thread_pool.rejection_policy="Run"/>

      <PING gossip_host="150.0.149.105" gossip_port="5555" gossip_refresh="5000"
      timeout="2000" num_initial_members="3"/>

      <MERGE2 max_interval="30000" min_interval="10000"/>
      <FD_SOCK num_tries="3" suspect_msg_interval="3000"/>
      <FD timeout="3000" max_tries="4" shun="true"/>
      <VERIFY_SUSPECT timeout="1500" />

      <pbcast.NAKACK use_stats_for_retransmission="true"
      exponential_backoff="150"
      use_mcast_xmit="true" gc_lag="0"
      retransmit_timeout="50,300,600,1200,2400"
      discard_delivered_msgs="true"/>


      <pbcast.STABLE stability_delay="1000" desired_avg_gossip="40000"
      max_bytes="1000000"/>

      <VIEW_SYNC avg_send_interval="60000" />
      <pbcast.GMS print_local_addr="true" join_timeout="3000"
      shun="false" view_bundling="true" view_ack_collection_timeout="2000"/>

      <FC max_credits="500000" min_threshold="0.20"/>
      <FRAG2 frag_size="60000" />
      <pbcast.STREAMING_STATE_TRANSFER use_reading_thread="true" />
      <pbcast.FLUSH timeout="0"/>


      true
      15000
      15000
      10000
      false




      Thanks...

        • 1. Re: What's wrong with this configuration??
          nachiket_patel

          Sorry,
          I don't know why the XML was removed; I pasted it properly. I am pasting it again....
          Here it is..

          <mbean code="org.jboss.cache.jmx.CacheJmxWrapper"
           name="jboss.cache:service=TreeCache">
          
           <depends>jboss:service=Naming</depends>
           <depends>jboss:service=TransactionManager</depends>
           <attribute name="TransactionManagerLookupClass">org.jboss.cache.transaction.GenericTransactionManagerLookup </attribute>
           <attribute name="IsolationLevel">REPEATABLE_READ</attribute>
           <attribute name="CacheMode">REPL_ASYNC</attribute>
           <attribute name="UseReplQueue">false</attribute>
           <attribute name="ReplQueueInterval">0</attribute>
           <attribute name="ReplQueueMaxElements">0</attribute>
           <attribute name="ClusterName">JBossCache-Cluster</attribute>
           <attribute name="ClusterConfig">
           <config>
           <TCP start_port="7800"
           loopback = "true"
           discard_incompatible_packets="true"
           max_bundle_size="64000"
           max_bundle_timeout="30"
           use_incoming_packet_handler="true"
           enable_bundling="false"
           enable_diagnostics="true"
           use_concurrent_stack="true"
           thread_naming_pattern="pl"
          
           thread_pool.enabled="true"
           thread_pool.min_threads="2"
           thread_pool.max_threads="8"
           thread_pool.keep_alive_time="30000"
           thread_pool.queue_enabled="true"
           thread_pool.queue_max_size="10000"
           thread_pool.rejection_policy="run"
          
           oob_thread_pool.enabled="true"
           oob_thread_pool.min_threads="1"
           oob_thread_pool.max_threads="8"
           oob_thread_pool.keep_alive_time="5000"
           oob_thread_pool.queue_enabled="false"
           oob_thread_pool.queue_max_size="100"
           oob_thread_pool.rejection_policy="Run"/>
          
           <PING gossip_host="150.0.149.105" gossip_port="5555" gossip_refresh="5000"
           timeout="2000" num_initial_members="3"/>
          
           <MERGE2 max_interval="30000" min_interval="10000"/>
           <FD_SOCK num_tries="3" suspect_msg_interval="3000"/>
           <FD timeout="3000" max_tries="4" shun="true"/>
           <VERIFY_SUSPECT timeout="1500" />
           <BARRIER />
           <pbcast.NAKACK use_stats_for_retransmission="true"
           exponential_backoff="150"
           use_mcast_xmit="true" gc_lag="0"
           retransmit_timeout="50,300,600,1200,2400"
           discard_delivered_msgs="true"/>
          
           <UNICAST timeout="300,600,1200,2400,3600"/>
           <pbcast.STABLE stability_delay="1000" desired_avg_gossip="40000"
           max_bytes="1000000"/>
          
           <VIEW_SYNC avg_send_interval="60000" />
           <pbcast.GMS print_local_addr="true" join_timeout="3000"
           shun="false" view_bundling="true" view_ack_collection_timeout="2000"/>
          
           <FC max_credits="500000" min_threshold="0.20"/>
           <FRAG2 frag_size="60000" />
           <pbcast.STREAMING_STATE_TRANSFER use_reading_thread="true" />
           <pbcast.FLUSH timeout="0"/>
           </config>
           </attribute>
           <attribute name="FetchInMemoryState">true</attribute>
           <attribute name="StateRetrievalTimeout">15000</attribute>
           <attribute name="SyncReplTimeout">15000</attribute>
           <attribute name="LockAcquisitionTimeout">10000</attribute>
           <attribute name="UseRegionBasedMarshalling">false</attribute>
           </mbean>



          UDP configuration, with which the data does get synchronized after reconnection.
          The rest of the configuration is the same, so I am pasting only the UDP block.

          <config>
           <UDP mcast_addr="228.20.10.10"
           mcast_port="45588"
           tos="8"
           ucast_recv_buf_size="20000000"
           ucast_send_buf_size="640000"
           mcast_recv_buf_size="25000000"
           mcast_send_buf_size="640000"
           loopback="false"
           discard_incompatible_packets="true"
           max_bundle_size="64000"
           max_bundle_timeout="30"
           use_incoming_packet_handler="true"
           ip_ttl="2"
           enable_bundling="false"
           enable_diagnostics="true"
          
           use_concurrent_stack="true"
          
           thread_naming_pattern="pl"
          
           thread_pool.enabled="true"
           thread_pool.min_threads="1"
           thread_pool.max_threads="25"
           thread_pool.keep_alive_time="30000"
           thread_pool.queue_enabled="true"
           thread_pool.queue_max_size="10"
           thread_pool.rejection_policy="Run"
          
           oob_thread_pool.enabled="true"
           oob_thread_pool.min_threads="1"
           oob_thread_pool.max_threads="4"
           oob_thread_pool.keep_alive_time="10000"
           oob_thread_pool.queue_enabled="true"
           oob_thread_pool.queue_max_size="10"
           oob_thread_pool.rejection_policy="Run"/>
          
           <PING timeout="2000" num_initial_members="3"/>
           <MERGE2 max_interval="30000" min_interval="10000"/>
           <FD_SOCK/>
           <FD timeout="5000" max_tries="4" shun="true"/>
           <VERIFY_SUSPECT timeout="1500"/>
           <pbcast.NAKACK max_xmit_size="60000"
           use_mcast_xmit="false" gc_lag="0"
           retransmit_timeout="300,600,1200,2400,4800"
           discard_delivered_msgs="true"/>
           <UNICAST timeout="300,600,1200,2400,3600"/>
           <pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000"
           max_bytes="400000"/>
           <pbcast.GMS print_local_addr="true" join_timeout="5000"
           join_retry_timeout="2000" shun="false"
           view_bundling="true" view_ack_collection_timeout="5000"/>
           <FRAG2 frag_size="60000"/>
           <pbcast.STREAMING_STATE_TRANSFER use_reading_thread="true"/>
           <!-- <pbcast.STATE_TRANSFER/> -->
           <pbcast.FLUSH timeout="0"/>
           </config>


          Thanks....

          • 2. Re: What's wrong with this configuration??
            nachiket_patel

            Hello,
            This is the log of the application instance that I disconnected from the network. Meanwhile, another application added 3 elements to the shared HashMap.
            When it connects back, it receives 3 messages but ignores them, as the sender has not yet been added to its view.

            I think these 3 messages are update messages, but they arrive before the sender is added to the view. Is this because of some wrong (timing) configuration??


            ---VIEW EVENT : View: [150.0.149.108:7800|2] [150.0.149.108:7800]
            
            2008-12-16 11:09:55,786 ERROR [GossipClient] (Timer-3,150.0.149.108:7800) exception connecting to host 150.0.149.105:5555
            ..........
            2008-12-16 11:09:56,786 ERROR [GossipClient] (Timer-4,150.0.149.108:7800) exception connecting to host 150.0.149.105:5555
            2008-12-16 11:09:58,223 ERROR [GossipClient] (Timer-1) exception connecting to host 150.0.149.105:5555
            2008-12-16 11:10:02,161 ERROR [GossipClient] (Timer-3,150.0.149.108:7800) exception connecting to host 150.0.149.105:5555
            2008-12-16 11:10:03,176 ERROR [GossipClient] (Timer-3,150.0.149.108:7800) exception connecting to host 150.0.149.105:5555
            2008-12-16 11:10:03,208 ERROR [GossipClient] (Timer-1) exception connecting to host 150.0.149.105:5555
            2008-12-16 11:10:08,223 ERROR [GossipClient] (Timer-1) exception connecting to host 150.0.149.105:5555
            2008-12-16 11:10:10,880 WARN [FD] (OOB-4,150.0.149.108:7800) I was suspected by 150.0.149.105:7800; ignoring the SUSPECT message and sending back a HEARTBEAT_ACK
            2008-12-16 11:10:14,348 WARN [NAKACK] (Incoming-1,150.0.149.108:7800) 150.0.149.108:7800] discarded message from non-member 150.0.149.105:7800, my view is [150.0.149.108:7800|2] [150.0.149.108:7800]
            2008-12-16 11:10:19,208 WARN [NAKACK] (Incoming-2,150.0.149.108:7800) 150.0.149.108:7800] discarded message from non-member 150.0.149.105:7800, my view is [150.0.149.108:7800|2] [150.0.149.108:7800]
            2008-12-16 11:10:19,380 WARN [NAKACK] (Incoming-1,150.0.149.108:7800) 150.0.149.108:7800] discarded message from non-member 150.0.149.105:7800, my view is [150.0.149.108:7800|2] [150.0.149.108:7800]
            2008-12-16 11:10:23,301 ERROR [ConnectionTable] (ConnectionTable.Connection.Sender local_addr=150.0.149.108:7800 [150.0.149.108:2313 - 150.0.149.105:7800],150.0.149.108:7800) failed sending data to 150.0.149.105:7800: java.net.SocketException: Socket closed
            
            ---VIEW EVENT :
            View: [150.0.149.108:7800|3] [150.0.149.108:7800, 150.0.149.105:7800]


            • 3. Re: What's wrong with this configuration??
              nachiket_patel

              Again, I have changed my configuration to v3.0. I never knew that there was a new configuration file format; I copied the previous one from the "3.0" user guide.

              The problem still remains (I am unable to sync 2 instances of the application after reconnection).

              <jbosscache xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
              xmlns="urn:jboss:jbosscache-core:config:3.0" xsi:schemaLocation='urn:jboss:jbosscache-core:config:3.0 ../jbosscache-config-3.0.xsd
               urn:jboss:jbosscache-core:cache-repo:3.0 ../jbosscache-registry-3.0.xsd'>
              
              
               <transaction
              transactionManagerLookupClass="org.jboss.cache.transaction.GenericTransactionManagerLookup"
               syncRollbackPhase="false"
               syncCommitPhase="false"/>
              
               <locking
               isolationLevel="REPEATABLE_READ"
               lockParentForChildInsertRemove="false"
               lockAcquisitionTimeout="20000"
               nodeLockingScheme="mvcc"
               writeSkewCheck="false"
               concurrencyLevel="500"/>
              
               <clustering mode="replication" clusterName="JBossCache-cluster">
               <stateRetrieval timeout="20000" fetchInMemoryState="true"/>
               <async useReplQueue="false"
               replQueueInterval="0"
               replQueueMaxElements="0"
               serializationExecutorPoolSize="20"
               serializationExecutorQueueSize="5000000"/>
              
              
               <jgroupsConfig>
              
               <TCP start_port="7800"
               loopback = "true"
               discard_incompatible_packets="true"
               max_bundle_size="64000"
               max_bundle_timeout="30"
               use_incoming_packet_handler="true"
               enable_bundling="true"
               enable_diagnostics="true"
               use_concurrent_stack="true"
               thread_naming_pattern="pl"
              
               thread_pool.enabled="true"
               thread_pool.min_threads="2"
               thread_pool.max_threads="8"
               thread_pool.keep_alive_time="30000"
               thread_pool.queue_enabled="true"
               thread_pool.queue_max_size="10000"
               thread_pool.rejection_policy="run"
              
               oob_thread_pool.enabled="true"
               oob_thread_pool.min_threads="1"
               oob_thread_pool.max_threads="8"
               oob_thread_pool.keep_alive_time="5000"
               oob_thread_pool.queue_enabled="false"
               oob_thread_pool.queue_max_size="100"
               oob_thread_pool.rejection_policy="Run"/>
              
               <TCPGOSSIP initial_hosts="150.0.149.105[5555]" gossip_refresh_rate="5000"
               num_initial_members="1" />
              
               <MERGE2 max_interval="10000" min_interval="3000"/>
               <FD_SOCK num_tries="3" suspect_msg_interval="1000"/>
               <FD timeout="5000" max_tries="3" shun="true"/>
              
               <VERIFY_SUSPECT timeout="1500" />
              
               <pbcast.NAKACK
               use_mcast_xmit="false"
               gc_lag="0"
               retransmit_timeout="300,600,1200,2400,4800"
               discard_delivered_msgs="true"
               print_stability_history_on_failed_xmit= "true"/>
              
               <UNICAST timeout="300,600,1200,2400,3600"/>
              
               <pbcast.STABLE stability_delay="1000"
               desired_avg_gossip="50000"
               max_bytes="400000"/>
              
              
               <pbcast.GMS print_local_addr="true" join_timeout="2000"
               shun="true" view_bundling="false" view_ack_collection_timeout="3000"/>
              
              
               <FRAG2 frag_size="60000" />
               <pbcast.STATE_TRANSFER />
               <pbcast.FLUSH timeout="0"/>
              
               </jgroupsConfig>
               </clustering>
              </jbosscache>


              Guys, Help me out.

              Thanks....

              • 4. Re: What's wrong with this configuration??
                jason.greene

                Why are you using gossip? You don't need a gossip router to do TCP. Also, gossip isn't preferable for a cache setup.

                BTW, if you pull the cable long enough for a network split/partition to occur, and later reconnect the cable, then you will trigger a mergeview, which JBoss Cache does not handle (you have to handle the condition in application code via a cache listener).

                See the following JIRA task and the resulting discussion
                https://jira.jboss.org/jira/browse/JBCACHE-471

                Part of the reason this is not handled to date is that it can not correctly merge the state without having application specific knowledge. As an example, if both split groups updated the same key/value, which one do you take?

                • 5. Re: What's wrong with this configuration??
                  jason.greene

                  I am moving this thread to the core edition forum since it is a general topic.

                  • 6. Re: What's wrong with this configuration??
                    jason.greene

                    Also, to clarify my statement on merge: this does not happen if a node crashes or is shut down. Under such a scenario, the node will rejoin the cluster and copy all state from its peers.

                    • 7. Re: What's wrong with this configuration??
                      nachiket_patel

                      Jason wrote:
                      ----------
                      Why are you using gossip? You don't need a gossip router to do TCP. Also, gossip isn't preferable for a cache setup.
                      -----------
                      I am planning to use TCPPING, but as it is causing a problem, I am temporarily using gossip.

                      When I use TCPPING, it does not reconnect. When the network is disconnected from computer A, A receives a new view of its own [only 1 member, A], and the creator is also A. [So I can identify from the event that this computer is disconnected.]

                      But on network reconnect, nothing happens: no new view is received, no reconnection, nothing. (I have set shunning to true.)

                      Why is it behaving like this?? I read that you can use either gossip or TCPPING, and that both behave the same. [The difference is that one runs as a separate application - GossipRouter - and TCPPING does not.]

                      -Nachiket

                      • 8. Re: What's wrong with this configuration??
                        nachiket_patel

                         

                        BTW, if you pull the cable long enough for a network split/partition to occur, and later reconnect the cable, then you will trigger a mergeview, which JBoss Cache does not handle (you have to handle the condition in application code via a cache listener).


                        Using ViewChanged event???

                        Part of the reason this is not handled to date is that it can not correctly merge the state without having application specific knowledge. As an example, if both split groups updated the same key/value, which one do you take?


                        So what should I do?? Stop and start the cache??? I think that looks like the only option; I am unable to find something like .merge() or .update(View/Address) etc.


                        • 9. Re: What's wrong with this configuration??
                          nachiket_patel

                          After my disconnected application (150.0.149.108) [which is not the merge leader] is connected back to the network, these messages are logged repeatedly.

                          2008-12-17 15:56:40,284 DEBUG [GMS] (ViewHandler,150.0.149.108:7957) Determining merge leader from coordinators: [150.0.149.105:7957, 150.0.149.108:7957]
                          2008-12-17 15:56:40,300 DEBUG [GMS] (ViewHandler,150.0.149.108:7957) I (150.0.149.108:7957) am not the merge leader, waiting for merge leader (150.0.149.105:7957) to initiate merge
                          


                          And the server application (merge leader, 150.0.149.105), whose IP and port are specified in TCPPING, is not doing anything; it is just listening,
                          and these messages are logged repeatedly.

                          2008-12-17 16:03:43,080 DEBUG [MERGE2] (Timer-2,150.0.149.105:7957) 150.0.149.105:7957 is looking for merge candidates, found initial_mbrs=[[own_addr=150.0.149.105:7957, coord_addr=150.0.149.105:7957, is_server=true]]


                          What is the problem??

                          Regards,
                          Nachiket



                          • 10. Re: What's wrong with this configuration??
                            nachiket_patel


                            I found it.
                            Actually for TCP,

                            Either
                            we have to specify all the host names in TCPPING initial_hosts

                            or

                            we have to use MPING.


                            Regards,
                            Nachiket

                            • 11. Re: What's wrong with this configuration??
                              nachiket_patel


                              I found it.
                              Actually for TCP,

                              Either
                              we have to specify all the host names in TCPPING initial_hosts

                              or

                              we have to use MPING.


                              Regards,
                              Nachiket