While doing network glitch testing (disconnecting one server from the network for 2-3 seconds) on a cluster of over 100 nodes, we get the following error log related to JGroups.
JGroups configuration (running on an IPv4 system):
<config>
<UDP
mcast_addr="228.8.8.8"
enable_diagnostics="false"
discard_incompatible_packets="true"
loopback="true"
enable_bundling="false"
mcast_recv_buf_size="130000"
ucast_send_buf_size="64000"/>
<PING
timeout="6000"
num_initial_members="3"
break_on_coord_rsp="false"
num_ping_requests="2"/>
<MERGE2
max_interval="30000"
min_interval="10000"/>
<FD_ALL
interval="10000"
timeout="31000"/>
<VERIFY_SUSPECT
timeout="5000"
num_msgs="2"/>
<BARRIER/>
<pbcast.NAKACK
use_stats_for_retransmission="false"
exponential_backoff="0"
use_mcast_xmit="true" gc_lag="0"
retransmit_timeout="50,300,600,1200"
discard_delivered_msgs="true"/>
<UNICAST timeout="300,600,1200"/>
<pbcast.STABLE
stability_delay="1000"
desired_avg_gossip="50000"
max_bytes="1000000"/>
<VIEW_SYNC avg_send_interval="45000"/>
<pbcast.GMS
view_ack_collection_timeout="3000"
print_local_addr="true"
join_timeout="2000"
view_bundling="true"/>
<FRAG2 frag_size="55000"/>
<pbcast.STATE_TRANSFER/>
</config>
<139>2010-04-12 15:35:20,535 ERROR BERLIN OTHER_0000 "couldn't deliver message [dst: berlin.oz.com-39642, src: lima.oz.com-6451 (3 headers), size=0 bytes]" <VERBOSE=2537> [Thread - Incoming-2,RA_SG_BCC,berlin.oz.com-39642] (org.jgroups.protocols.UNICAST) (UNICAST.java:585)
java.lang.NullPointerException
at org.jgroups.blocks.LazyRemovalCache.removeMarkedElements(LazyRemovalCache.java:189)
at org.jgroups.blocks.LazyRemovalCache.checkMaxSizeExceeded(LazyRemovalCache.java:177)
at org.jgroups.blocks.LazyRemovalCache.retainAll(LazyRemovalCache.java:130)
at org.jgroups.blocks.LazyRemovalCache.retainAll(LazyRemovalCache.java:113)
at org.jgroups.protocols.TP.handleDownEvent(TP.java:1160)
at org.jgroups.protocols.TP.down(TP.java:840)
at org.jgroups.protocols.Discovery.down(Discovery.java:423)
at org.jgroups.protocols.MERGE2.down(MERGE2.java:171)
at org.jgroups.protocols.FD_ALL.down(FD_ALL.java:193)
at org.jgroups.protocols.VERIFY_SUSPECT.down(VERIFY_SUSPECT.java:69)
at org.jgroups.protocols.BARRIER.down(BARRIER.java:91)
at org.jgroups.protocols.pbcast.NAKACK.down(NAKACK.java:622)
at org.jgroups.protocols.UNICAST.down(UNICAST.java:408)
at org.jgroups.protocols.pbcast.STABLE.down(STABLE.java:320)
at org.jgroups.protocols.VIEW_SYNC.down(VIEW_SYNC.java:189)
at org.jgroups.protocols.pbcast.GMS.castViewChangeWithDest(GMS.java:467)
at org.jgroups.protocols.pbcast.Merger.handleMergeView(Merger.java:166)
at org.jgroups.protocols.pbcast.ServerGmsImpl.handleMergeView(ServerGmsImpl.java:37)
at org.jgroups.protocols.pbcast.GMS.up(GMS.java:825)
at org.jgroups.protocols.VIEW_SYNC.up(VIEW_SYNC.java:173)
at org.jgroups.protocols.pbcast.STABLE.up(STABLE.java:236)
at org.jgroups.protocols.UNICAST.handleDataReceived(UNICAST.java:582)
at org.jgroups.protocols.UNICAST.up(UNICAST.java:276)
at org.jgroups.protocols.pbcast.NAKACK.up(NAKACK.java:692)
at org.jgroups.protocols.BARRIER.up(BARRIER.java:120)
at org.jgroups.protocols.VERIFY_SUSPECT.up(VERIFY_SUSPECT.java:132)
at org.jgroups.protocols.FD_ALL.up(FD_ALL.java:178)
at org.jgroups.stack.Protocol.up(Protocol.java:340)
at org.jgroups.protocols.Discovery.up(Discovery.java:277)
at org.jgroups.protocols.PING.up(PING.java:67)
at org.jgroups.protocols.TP.passMessageUp(TP.java:953)
at org.jgroups.protocols.TP.access$100(TP.java:53)
at org.jgroups.protocols.TP$IncomingPacket.handleMyMessage(TP.java:1458)
at org.jgroups.protocols.TP$IncomingPacket.run(TP.java:1439)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:650)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:676)
at java.lang.Thread.run(Thread.java:595)
<END>