
    Cross DC Relay2 - Master node doesn't hand over cluster information to new master during failure

    vikrant02

      Hi,

       

      We have an Infinispan setup in an OpenShift environment, deployed across two data centers. We are using the kubernetes JGroups stack to form the local cluster and the tcp stack with TCPPING as the discovery protocol to form the cross-site bridge.

       

      Following is the JGroups configuration:

      <subsystem xmlns="urn:infinispan:server:jgroups:9.0">
        <channels default="cluster">
          <channel name="cluster"/>
          <channel name="xsite" stack="tcp"/>
        </channels>
        <stacks default="${jboss.default.jgroups.stack:kubernetes}">
          <stack name="tcp">
            <transport type="TCP" socket-binding="jgroups-tcp">
              <property name="external_addr">${jgroups.tcp.external_addr:}</property>
            </transport>
            <protocol type="TCPPING">
              <property name="initial_hosts">${jgroups.tcpping.initial_hosts:}</property>
              <property name="ergonomics">false</property>
            </protocol>
            <protocol type="MERGE3">
              <property name="min_interval">10000</property>
              <property name="max_interval">30000</property>
            </protocol>
            <protocol type="FD_SOCK" socket-binding="jgroups-tcp-fd"/>
            <protocol type="FD_ALL">
              <property name="timeout">60000</property>
              <property name="interval">15000</property>
              <property name="timeout_check_interval">5000</property>
            </protocol>
            <protocol type="VERIFY_SUSPECT">
              <property name="timeout">5000</property>
            </protocol>
            <protocol type="pbcast.NAKACK2">
              <property name="use_mcast_xmit">false</property>
              <property name="xmit_interval">100</property>
              <property name="xmit_table_num_rows">50</property>
              <property name="xmit_table_msgs_per_row">1024</property>
              <property name="xmit_table_max_compaction_time">30000</property>
              <property name="resend_last_seqno">true</property>
            </protocol>
            <protocol type="UNICAST3">
              <property name="xmit_interval">100</property>
              <property name="xmit_table_num_rows">50</property>
              <property name="xmit_table_msgs_per_row">1024</property>
              <property name="xmit_table_max_compaction_time">30000</property>
              <property name="conn_expiry_timeout">0</property>
            </protocol>
            <protocol type="pbcast.STABLE">
              <property name="stability_delay">500</property>
              <property name="desired_avg_gossip">5000</property>
              <property name="max_bytes">1M</property>
            </protocol>
            <protocol type="pbcast.GMS">
              <property name="print_local_addr">true</property>
              <property name="install_view_locally_first">true</property>
              <property name="join_timeout">${jgroups.join_timeout:5000}</property>
            </protocol>
            <protocol type="MFC">
              <property name="max_credits">2m</property>
              <property name="min_threshold">0.40</property>
            </protocol>
            <protocol type="FRAG3"/>
            <protocol type="RSVP"/>
          </stack>
          <stack name="kubernetes">
            <transport type="TCP" socket-binding="jgroups-tcp">
              <property name="logical_addr_cache_expiration">360000</property>
            </transport>
            <protocol type="kubernetes.KUBE_PING"/>
            <protocol type="MERGE3">
              <property name="min_interval">10000</property>
              <property name="max_interval">30000</property>
            </protocol>
            <protocol type="FD_SOCK" socket-binding="jgroups-tcp-fd"/>
            <protocol type="FD_ALL">
              <property name="timeout">60000</property>
              <property name="interval">15000</property>
              <property name="timeout_check_interval">5000</property>
            </protocol>
            <protocol type="VERIFY_SUSPECT">
              <property name="timeout">5000</property>
            </protocol>
            <protocol type="pbcast.NAKACK2">
              <property name="use_mcast_xmit">false</property>
              <property name="xmit_interval">100</property>
              <property name="xmit_table_num_rows">50</property>
              <property name="xmit_table_msgs_per_row">1024</property>
              <property name="xmit_table_max_compaction_time">30000</property>
              <property name="resend_last_seqno">true</property>
            </protocol>
            <protocol type="UNICAST3">
              <property name="xmit_interval">100</property>
              <property name="xmit_table_num_rows">50</property>
              <property name="xmit_table_msgs_per_row">1024</property>
              <property name="xmit_table_max_compaction_time">30000</property>
              <property name="conn_expiry_timeout">0</property>
            </protocol>
            <protocol type="pbcast.STABLE">
              <property name="stability_delay">500</property>
              <property name="desired_avg_gossip">5000</property>
              <property name="max_bytes">1M</property>
            </protocol>
            <protocol type="pbcast.GMS">
              <property name="print_local_addr">true</property>
              <property name="install_view_locally_first">true</property>
              <property name="join_timeout">${jgroups.join_timeout:5000}</property>
            </protocol>
            <protocol type="MFC">
              <property name="max_credits">2m</property>
              <property name="min_threshold">0.40</property>
            </protocol>
            <protocol type="FRAG3"/>
            <relay site="SITE1">
              <property name="relay_multicasts">false</property>
              <property name="max_site_masters">2</property>
            </relay>
          </stack>
        </stacks>
      </subsystem>

      Cluster startup is sequential, i.e. site1 starts first and then site2. We populate the initial_hosts list from the pods available during Infinispan startup, i.e. the first Infinispan node in site1 will have only its own hostname in its initial_hosts list, but the last Infinispan node in site2 will contain the hostnames of all Infinispan nodes in its initial_hosts list.

       

      Since it is an OpenShift deployment, we do not know the IP addresses of the Infinispan pods until they start, which means we can't pre-populate initial_hosts with the IP addresses of all Infinispan nodes.
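
      For illustration, the resolved property might look like this on the last pod to start (in site2) versus the first pod to start (in site1). The pod hostnames and port 7800 below are hypothetical placeholders, not values from our deployment:

      <!-- Hypothetical value on the last pod to start (site2): all pods known so far. -->
      <property name="initial_hosts">infinispan-0-site1[7800],infinispan-1-site1[7800],infinispan-0-site2[7800],infinispan-1-site2[7800]</property>
      <!-- Hypothetical value on the first pod to start (site1): only itself. -->
      <property name="initial_hosts">infinispan-0-site1[7800]</property>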

       

      After the initial sequential startup, both data centers successfully join the cross-DC bridge. However, if the master in site1 goes down, the master address for site2 does not get passed to the new master in site1, and the new master needs to do discovery again. Since the Infinispan pods in site1 started first, their initial_hosts lists do not contain the hostname of the site2 master, and the cross-site bridge stays broken until the site2 master does its discovery again. This leads to a cross-DC replication downtime of approximately 1 minute every time.

       

      Is there any way to make sure that if a site master goes down, the new site master gets the info of the other site's master? Or is there any other way to manage cross-DC replication in a dynamic OpenShift environment?

       

      Thanks,

      Vikrant

        • 1. Re: Cross DC Relay2 - Master node doesn't hand over cluster information to new master during failure
          sebastian.laskawiec

          Sounds like a very complicated issue. It might be worth asking belaban, galder.zamarreno and pferraro.

           

          Here are a couple of things I can spot:

          • First, you need to form a separate cluster in each DC. I see you already included `KUBE_PING` in your configuration, so this part should be solved. I'm assuming at this point that each DC contains a well-formed cluster of Infinispan nodes.
          • The `RELAY2` protocol (the one that is heavily used during x-site replication) requires forming an additional, global cluster. The main problem here is discovering its members. For this I suggest allocating a Load-Balancer Service on each site (this creates an externally reachable load balancer that forwards traffic to your Pods) and using the obtained Load-Balancer IP addresses for `TCPPING`; see the sketch after this list. The only thing I'm not entirely sure about is how the selector should be defined on the Service. Should it select all the Pods or just one of them (a coordinator)? I would need to run a couple of experiments to check this.
          • With that kind of setup you should be able to start both DCs in parallel and solve the downtime issue.
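
          To make the load-balancer idea concrete, here is a minimal sketch of how the tcp stack's discovery could look once each site has a LoadBalancer Service in front of its pods. The addresses 10.0.1.50 (site1) and 10.0.2.50 (site2) and port 7800 are assumed placeholders, not values from this thread:

          <!-- Sketch only: static cross-site discovery via the per-site load-balancer addresses. -->
          <!-- 10.0.1.50, 10.0.2.50 and port 7800 are assumptions for illustration. -->
          <protocol type="TCPPING">
            <property name="initial_hosts">10.0.1.50[7800],10.0.2.50[7800]</property>
            <property name="ergonomics">false</property>
          </protocol>

          Because the load-balancer address stays stable while pods come and go, a newly elected site master would not depend on a stale initial_hosts list to rediscover the remote site.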
          • 2. Re: Cross DC Relay2 - Master node doesn't hand over cluster information to new master during failure
            vikrant02

            Thanks for the suggestion. I need to try that setup to see if it can work. However, I can see a few complications with the load-balancer Service:

            • If we have all pods in the Service selector, then there is a chance that the Service may not point to the current master, which will result in discovery failure.
            • If we want to point the load-balancer Service to the master node, then how do we repopulate the Service once the current master node goes down and a new pod takes over the master responsibility?

             

            belaban, galder.zamarreno, pferraro: any suggestion would be very helpful.

            • 3. Re: Cross DC Relay2 - Master node doesn't hand over cluster information to new master during failure
              sebastian.laskawiec

              After experimenting a little bit, it seems that setting the maximum number of site masters to a high value works: RELAY2 promotes up to max_site_masters members (the default is 1) to site master, so a high value allows all nodes to become site masters and replicate data between DCs. Here's the configuration snippet:

               

              <relay site="site-1">
                <remote-site name="site-2" stack="relay-global" cluster="global"/>
                <property name="relay_multicasts">false</property>
                <property name="max_site_masters">1000</property>
              </relay>

               

              Let me know how it goes!

              • 4. Re: Cross DC Relay2 - Master node doesn't hand over cluster information to new master during failure
                vikrant02

                Hi,

                 

                It took some time before I could test it. I have updated the maximum number of site masters to 3, but now there is an issue during initial discovery.

                First, three Infinispan instances start in site1; all of them are defined as masters, and each of them starts the site1 x-site bridge. Then the first Infinispan node from site2 starts up and joins the x-site bridge with one of the Infinispan instances from site1, but the other two instances from site1 don't join the site2 x-site bridge immediately. It takes a couple of minutes before the other two instances of site1 join the site2 x-site bridge, and it is not always consistent; there are cases when the other two instances are never able to join the site2 x-site bridge.

                 

                I am not sure if I am missing some setting here, but as per my understanding, if there are multiple masters in a site then they should all have the same x-site view. Here, however, each master instance in the same site holds a different x-site view.

                Here is "view" attribute value for all three nodes from "jboss.as:subsystem=datagrid-jgroups,channel=xsite" mbean for site1 nodes after 3 infinispan instances running in both site1 and site2

                 

                [_node0:site1|5] (4) [_node0:site1, _node0:site2, _node1:site2, _node2:site2]
                [_node1:site1|2] (2) [_node1:site1, _node0:site2]
                [_node2:site1|1] (2) [_node2:site1, _node0:site2]

                 

                Following is the JGroups configuration currently in use:

                <subsystem
                      xmlns="urn:infinispan:server:jgroups:9.0">
                      <channels default="cluster">
                        <channel name="cluster"/>
                        <channel name="xsite" stack="tcp"/>
                      </channels>
                      <stacks default="${jboss.default.jgroups.stack:kubernetes}">
                        <stack name="tcp">
                          <transport type="TCP" socket-binding="jgroups-tcp">
                            <property name="external_addr">${jgroups.tcp.external_addr:}</property>
                          </transport>
                          <protocol type="TCPPING">
                            <property name="initial_hosts">${jgroups.tcpping.initial_hosts:}</property>
                            <property name="ergonomics">false</property>
                          </protocol>
                          <protocol type="MERGE3">
                            <property name="min_interval">10000</property>
                            <property name="max_interval">30000</property>
                          </protocol>
                          <protocol type="FD_SOCK" socket-binding="jgroups-tcp-fd"/>
                          <protocol type="FD_ALL">
                            <property name="timeout">60000</property>
                            <property name="interval">15000</property>
                            <property name="timeout_check_interval">5000</property>
                          </protocol>
                          <protocol type="VERIFY_SUSPECT">
                            <property name="timeout">5000</property>
                          </protocol>
                          <protocol type="pbcast.NAKACK2">
                            <property name="use_mcast_xmit">false</property>
                            <property name="xmit_interval">100</property>
                            <property name="xmit_table_num_rows">50</property>
                            <property name="xmit_table_msgs_per_row">1024</property>
                            <property name="xmit_table_max_compaction_time">30000</property>
                            <property name="resend_last_seqno">true</property>
                          </protocol>
                          <protocol type="UNICAST3">
                            <property name="xmit_interval">100</property>
                            <property name="xmit_table_num_rows">50</property>
                            <property name="xmit_table_msgs_per_row">1024</property>
                            <property name="xmit_table_max_compaction_time">30000</property>
                            <property name="conn_expiry_timeout">0</property>
                          </protocol>
                          <protocol type="pbcast.STABLE">
                            <property name="stability_delay">500</property>
                            <property name="desired_avg_gossip">5000</property>
                            <property name="max_bytes">1M</property>
                          </protocol>
                          <protocol type="pbcast.GMS">
                            <property name="print_local_addr">true</property>
                            <property name="install_view_locally_first">true</property>
                            <property name="join_timeout">${jgroups.join_timeout:5000}</property>
                          </protocol>
                          <protocol type="MFC">
                            <property name="max_credits">2m</property>
                            <property name="min_threshold">0.40</property>
                          </protocol>
                          <protocol type="FRAG3"/>
                          <protocol type="RSVP"/>
                        </stack>
                        <stack name="kubernetes">
                          <transport type="TCP" socket-binding="jgroups-tcp">
                            <property name="logical_addr_cache_expiration">360000</property>
                          </transport>
                          <protocol type="kubernetes.KUBE_PING"/>
                          <protocol type="MERGE3">
                            <property name="min_interval">10000</property>
                            <property name="max_interval">30000</property>
                          </protocol>
                          <protocol type="FD_SOCK" socket-binding="jgroups-tcp-fd"/>
                          <protocol type="FD_ALL">
                            <property name="timeout">60000</property>
                            <property name="interval">15000</property>
                            <property name="timeout_check_interval">5000</property>
                          </protocol>
                          <protocol type="VERIFY_SUSPECT">
                            <property name="timeout">5000</property>
                          </protocol>
                          <protocol type="pbcast.NAKACK2">
                            <property name="use_mcast_xmit">false</property>
                            <property name="xmit_interval">100</property>
                            <property name="xmit_table_num_rows">50</property>
                            <property name="xmit_table_msgs_per_row">1024</property>
                            <property name="xmit_table_max_compaction_time">30000</property>
                            <property name="resend_last_seqno">true</property>
                          </protocol>
                          <protocol type="UNICAST3">
                            <property name="xmit_interval">100</property>
                            <property name="xmit_table_num_rows">50</property>
                            <property name="xmit_table_msgs_per_row">1024</property>
                            <property name="xmit_table_max_compaction_time">30000</property>
                            <property name="conn_expiry_timeout">0</property>
                          </protocol>
                          <protocol type="pbcast.STABLE">
                            <property name="stability_delay">500</property>
                            <property name="desired_avg_gossip">5000</property>
                            <property name="max_bytes">1M</property>
                          </protocol>
                          <protocol type="pbcast.GMS">
                            <property name="print_local_addr">true</property>
                            <property name="install_view_locally_first">true</property>
                            <property name="join_timeout">${jgroups.join_timeout:5000}</property>
                          </protocol>
                          <protocol type="MFC">
                            <property name="max_credits">2m</property>
                            <property name="min_threshold">0.40</property>
                          </protocol>
                          <protocol type="FRAG3"/>
                          <relay site="${env.LOCAL_SITE}">
                                        <remote-site name="site2" stack="tcp" cluster="xsite"/>
                            <property name="relay_multicasts">false</property>
                            <property name="max_site_masters">3</property>
                          </relay>
                        </stack>
                      </stacks>
                    </subsystem>
                

                 

                Please let me know if some configuration is missing from my side, or if there is any other way to set this up.

                 

                Thanks for all the help!!

                 

                -Vikrant