1 2 Previous Next 15 Replies Latest reply on Nov 28, 2005 9:41 AM by alesj

    Treecache Lock Error

    fmaredia

      System Configuration:
      JBoss AS 4.0.3 RC1
      Hibernate 2.1.8
      MySQL 4.1.7
      TreeCache (local)

      Sometimes we get read lock errors from our Cache. It happens when a user logs in or it just happens after a while. Everything works after "refreshing" the browser, but why does the following error occur:

      12:03:28,788 ERROR [IdentityLock] write lock for //com/at/hib/persistence/Employ
      ee/31 could not be acquired after 15000 ms. Locks: Read lock owners: []
      Write lock owner: <null>:48
       (caller=<null>:47, lock info: write owner=<null>:48 (activeReaders=0, activeWri
      ter=Thread[http-0.0.0.0-8080-3,5,jboss], waitingReaders=0, waitingWriters=0, wai
      tingUpgrader=0))
      12:03:28,788 ERROR [BookingDAO] org.jboss.cache.lock.TimeoutException: write loc
      k for //com/at/hib/persistence/Employee/31 could not be acquired after 15000 ms.
       Locks: Read lock owners: []
      Write lock owner: <null>:48
       (caller=<null>:47, lock info: write owner=<null>:48 (activeReaders=0, activeWri
      ter=Thread[http-0.0.0.0-8080-3,5,jboss], waitingReaders=0, waitingWriters=0, wai
      tingUpgrader=0))
      12:03:44,913 ERROR [IdentityLock] write lock for //com/at/hib/persistence/Countr
      ies/71 could not be acquired after 15000 ms. Locks: Read lock owners: [<null>:49
      ]
      Write lock owner: null
       (caller=<null>:50, lock info: read owners=[<null>:49] (activeReaders=1, activeW
      riter=null, waitingReaders=0, waitingWriters=0, waitingUpgrader=0))
      12:03:44,913 ERROR [PersonAccessDAO] org.jboss.cache.lock.TimeoutException: writ
      e lock for //com/at/hib/persistence/Countries/71 could not be acquired after 150
      00 ms. Locks: Read lock owners: [<null>:49]
      Write lock owner: null
       (caller=<null>:50, lock info: read owners=[<null>:49] (activeReaders=1, activeW
      riter=null, waitingReaders=0, waitingWriters=0, waitingUpgrader=0))
      12:03:44,913 ERROR [IdentityLock] write lock for //com/at/hib/persistence/Employ
      ee/31 could not be acquired after 15000 ms. Locks: Read lock owners: []
      Write lock owner: <null>:50
       (caller=<null>:49, lock info: write owner=<null>:50 (activeReaders=0, activeWri
      ter=Thread[http-0.0.0.0-8080-3,5,jboss], waitingReaders=0, waitingWriters=0, wai
      tingUpgrader=0))
      12:03:44,913 ERROR [BookingDAO] org.jboss.cache.lock.TimeoutException: write loc
      k for //com/at/hib/persistence/Employee/31 could not be acquired after 15000 ms.
       Locks: Read lock owners: []
      Write lock owner: <null>:50
       (caller=<null>:49, lock info: write owner=<null>:50 (activeReaders=0, activeWri
      ter=Thread[http-0.0.0.0-8080-3,5,jboss], waitingReaders=0, waitingWriters=0, wai
      tingUpgrader=0))


      Code for TreeCache.xml is the following:
      <server>
      
       <classpath codebase="./lib" archives="jboss-cache.jar, jgroups.jar"/>
      
      
       <!-- ==================================================================== -->
       <!-- Defines TreeCache configuration -->
       <!-- ==================================================================== -->
      
       <mbean code="org.jboss.cache.TreeCache"
       name="jboss.cache:service=TreeCache">
      
       <depends>jboss:service=Naming</depends>
       <depends>jboss:service=TransactionManager</depends>
      
       <!--
       Configure the TransactionManager
       -->
       <attribute name="TransactionManagerLookupClass">org.jboss.cache.JBossTransactionManagerLookup</attribute>
      
      
       <!--
       Node isolation level : SERIALIZABLE
       REPEATABLE_READ (default)
       READ_COMMITTED
       READ_UNCOMMITTED
       NONE
       -->
       <attribute name="IsolationLevel">REPEATABLE_READ</attribute>
      
       <!--
       Valid modes are LOCAL
       REPL_ASYNC
       REPL_SYNC
       -->
       <attribute name="CacheMode">LOCAL</attribute>
      
       <!-- Name of cluster. Needs to be the same for all clusters, in order
       to find each other
       -->
       <attribute name="ClusterName">TreeCache-Cluster</attribute>
      
       <!-- JGroups protocol stack properties. Can also be a URL,
       e.g. file:/home/bela/default.xml
       <attribute name="ClusterProperties"></attribute>
       -->
      
       <attribute name="ClusterConfig">
       <config>
       <!-- UDP: if you have a multihomed machine,
       set the bind_addr attribute to the appropriate NIC IP address -->
       <!-- UDP: On Windows machines, because of the media sense feature
       being broken with multicast (even after disabling media sense)
       set the loopback attribute to true -->
       <UDP mcast_addr="228.1.2.3" mcast_port="45566"
       ip_ttl="64" ip_mcast="true"
       mcast_send_buf_size="150000" mcast_recv_buf_size="80000"
       ucast_send_buf_size="150000" ucast_recv_buf_size="80000"
       loopback="false"/>
       <PING timeout="2000" num_initial_members="3"
       up_thread="false" down_thread="false"/>
       <MERGE2 min_interval="10000" max_interval="20000"/>
       <FD shun="true" up_thread="true" down_thread="true"/>
       <VERIFY_SUSPECT timeout="1500"
       up_thread="false" down_thread="false"/>
       <pbcast.NAKACK gc_lag="50" retransmit_timeout="600,1200,2400,4800"
       up_thread="false" down_thread="false"/>
       <pbcast.STABLE desired_avg_gossip="20000"
       up_thread="false" down_thread="false"/>
       <UNICAST timeout="600,1200,2400" window_size="100" min_threshold="10"
       down_thread="false"/>
       <FRAG frag_size="8192"
       down_thread="false" up_thread="false"/>
       <pbcast.GMS join_timeout="5000" join_retry_timeout="2000"
       shun="true" print_local_addr="true"/>
       <pbcast.STATE_TRANSFER up_thread="false" down_thread="false"/>
       </config>
       </attribute>
      
       <!--
       Max number of entries in the cache. If this is exceeded, the
       eviction policy will kick some entries out in order to make
       more room
       -->
       <attribute name="MaxCapacity">20000</attribute>
      
       <!--
       The max amount of time (in milliseconds) we wait until the
       initial state (ie. the contents of the cache) are retrieved from
       existing members in a clustered environment
       -->
       <attribute name="InitialStateRetrievalTimeout">20000</attribute>
      
       <!--
       Number of milliseconds to wait until all responses for a
       synchronous call have been received.
       -->
       <attribute name="SyncReplTimeout">10000</attribute>
      
       <!-- Max number of milliseconds to wait for a lock acquisition -->
       <attribute name="LockAcquisitionTimeout">15000</attribute>
      
       <!-- Max number of milliseconds we hold a lock (not currently
       implemented) -->
       <attribute name="LockLeaseTimeout">60000</attribute>
      
       <!-- Name of the eviction policy class. Not supported now. -->
       <attribute name="EvictionPolicyClass"></attribute>
       </mbean>
      
      
      </server>


        • 1. Re: Treecache Lock Error
          rino_salvade

          We experienced a similar problem when we tried to access the same node that was modified again within the treecache listener (I do not know if that is the case with you). To our opinion the problem is that the lock and unlock interceptor use the thread as key for their lock table and not the method. This causes the problem that the lock table becomes empty but the node still maintains the lock.
          To solve the problem we changed the lock and unlock interceptors and made the method the key of the table. With this the problem went away.

          • 2. Re: Treecache Lock Error
            belaban

             

            "rino_salvade" wrote:
            We experienced a similar problem when we tried to access the same node that was modified again within the treecache listener (I do not know if that is the case with you).


            This is not recommended, and you will almost always run into problems, especially in the distributed case. There is a JIRA issue open for this: http://jira.jboss.com/jira/browse/JBCACHE-49


            To our opinion the problem is that the lock and unlock interceptor use the thread as key for their lock table and not the method.


            Note that this is only the case if no transaction is active for the current thread.


            This causes the problem that the lock table becomes empty but the node still maintains the lock.


            Come again, didn't understand what you said here ?


            To solve the problem we changed the lock and unlock interceptors and made the method the key of the table. With this the problem went away.


            What if multiple threads are calling the same method concurrently ? Then you would grant the lock on a given node to both threads ?

            • 3. Re: Treecache Lock Error
              rino_salvade

               


              Note that this is only the case if no transaction is active for the current thread.

              This is correct, but as far a I have read the config file of the original poster, this is the case.
              If we do the same stuff with a transaction manager we experienced a similar problem (especially in the rollback case). Here the point is that the transaction state rolling_back is not seen as a valid state and therefore the lock and unlck interceptor uses the same technique as if no transaction manager was present.
              Come again, didn't understand what you said here ?

              Ok, This might be a lengthy explanation and it is based on my understanding of the code. The assumption is that we have no transaction manager defined.
              For each call the lock interceptor creates a new listwith the lock entries. This list is kept in the lock_table with the current thread as key. The unlock interceptor then removes all these locks again. So far this is fine.
              But when we call within the notification of the TreeCacheListener again a get or put method then the lock interceptor creates again a new list with locks and replaces the one of the previous call. The unlock interceptor will then release only the locks that are part of the second list and remove the entry from the lock_table. So when the unlock interceptor of the original call is executed no list with locks to remove is there anymore. This means that all locks that have been acquired by the first call are not removed. But because the node itself maintains its lock as well, the lock remains on the node.

              What if multiple threads are calling the same method concurrently ? Then you would grant the lock on a given node to both threads ?

              Sorry for that, I was unclear on that issue. We use a combined key based on the thread and the method. Otherwise this would really cause problems.

              • 4. Re: Treecache Lock Error
                belaban

                 

                "rino_salvade" wrote:

                Note that this is only the case if no transaction is active for the current thread.

                This is correct, but as far a I have read the config file of the original poster, this is the case.


                No, he clearly defined a TransactionManager. So if he called tx.begin() we'd have a valid TX ! This doesn't mean he's calling tx.begin() though, might as well run without a TX...



                For each call the lock interceptor creates a new list with the lock entries. This list is kept in the lock_table with the current thread as key. The unlock interceptor then removes all these locks again. So far this is fine.
                But when we call within the notification of the TreeCacheListener again a get or put method then the lock interceptor creates again a new list with locks and replaces the one of the previous call.


                This is not correct; the callback runs in the *same* thread, so the lock table will already hold the lock and therefore use the same lock list. We don't create a new lock entry if we already have one.


                The unlock interceptor will then release only the locks that are part of the second list and remove the entry from the lock_table. So when the unlock interceptor of the original call is executed no list with locks to remove is there anymore. This means that all locks that have been acquired by the first call are not removed. But because the node itself maintains its lock as well, the lock remains on the node.



                Maybe my statement above is incorrect; in this case (since you experienced it), can you submit a unit test that shows this behavior ? Look at CallbackTest.testLocalCallbackWithoutTransaction(), and see whether your test could fit in there. This would be tremendously useful, cause if you are correct we have a major bug. I don't really care about the put(), but I do care about the get(), which is allowed (and must to do something useful!) to be called from within a callback.


                What if multiple threads are calling the same method concurrently ? Then you would grant the lock on a given node to both threads ?


                • 5. Re: Treecache Lock Error
                  belaban

                  Okay, I looked at this some more, and you are correct: I modified the CallbackTest.testLocalCallbackWithoutTransaction() method to show the problem - no need for you to write a unit test.
                  I'll create a JIRA issue for this (for 1.2.4).

                  • 6. Re: Treecache Lock Error
                    belaban
                    • 7. Re: Treecache Lock Error
                      rino_salvade

                      You're faster than me :-)

                      • 8. Re: Treecache Lock Error
                        belaban

                        I'm adding some unit tests, looks like this only occurs if you put() or get() an Fqn that is *different* than the Fqn passed in nodeCreated().
                        This is clearly a bug, it doesn't affect transactions though (but I'll check).
                        Expect this to be fixed in 1.2.4

                        • 9. Re: Treecache Lock Error
                          belaban

                          Okay, I fixed the above JIRA issue. But it is not nice - I essentially go back to the LockTable to re-read the correct lock value.
                          I have added a comment to http://jira.jboss.com/jira/browse/JBCACHE-270, which explains a cleaner solution (will be done probably in 1.2.5, with the new NotificationInterceptor):
                          This only occurs without transactions !

                          We also need to discuss *when* notifications are emitted. For example, now a nodeCreated() notification is emitted after the node was created, but *before* it is locked !
                          So a put("/a") with a nodeCreated("/a") callback of get("/a") will
                          - acquire a RL on "/a" (callback)
                          - release the RL (UnlockInterceptor of get())
                          - acquire a WL on "/a" (put())
                          - release a WL on "/a"

                          The better scenario would be:
                          - put("/a") acquired WL on "/a"
                          - get("/a") needs RL, but already has WL, fine
                          - get() releases the WL (UnlockInterceptor)
                          - put() (UnlockInt) doesn't have anything left to remove

                          • 10. Re: Treecache Lock Error
                            rino_salvade

                             

                            The better scenario would be:
                            - put("/a") acquired WL on "/a"
                            - get("/a") needs RL, but already has WL, fine
                            - get() releases the WL (UnlockInterceptor)
                            - put() (UnlockInt) doesn't have anything left to remove


                            To my point of view it would be better if the put() (UnlockInt) would release the WL. I agree that to prevent race conditions one should use a transaction manager but I think most developers would assume that you keep the write lock till your call to put has finished so the above stated schema seems to me a bit problematic.

                            • 11. Re: Treecache Lock Error
                              belaban

                              Yes, that's right. However, I'm not changing this right now, as we will handle this all in the new NotificationInterceptor. Can you make a comment on http://jira.jboss.com/jira/browse/JBCACHE-270 ?

                              • 12. Re: Treecache Lock Error
                                rino_salvade

                                I added the comment.

                                • 13. Re: Treecache Lock Error
                                  portuzerus

                                  Hi,
                                  We are using Hibernate 3.0.3 with Synchronous replication cache (TreeCache)

                                  <!--
                                   Valid modes are LOCAL
                                   REPL_ASYNC
                                   REPL_SYNC
                                   -->
                                   <attribute name="CacheMode">REPL_SYNC</attribute>
                                  


                                  we have 2 Servers, in each server, I have a lot of message similar at

                                  12:03:28,788 ERROR [IdentityLock] write lock for //com/cubika/app/domain/operators/Operator
                                  ee/31 could not be acquired after 15000 ms. Locks: Read lock owners: []
                                  Write lock owner: <null>:48
                                   (caller=<null>:47, lock info: write owner=<null>:48 (activeReaders=0, activeWri
                                  ter=Thread[http-0.0.0.0-8080-3,5,jboss], waitingReaders=0, waitingWriters=0, wai
                                  tingUpgrader=0))
                                  12:03:28,788 ERROR [BookingDAO] org.jboss.cache.lock.TimeoutException: write loc
                                  k for ///com/cubika/app/domain/operators/Adherent/5 could not be acquired after 15000 ms.
                                   Locks: Read lock owners: []
                                  Write lock owner: <null>:48
                                   (caller=<null>:47, lock info: write owner=<null>:48 (activeReaders=0, activeWri
                                  ter=Thread[http-0.0.0.0-8080-3,5,jboss], waitingReaders=0, waitingWriters=0, wai
                                  tingUpgrader=0))
                                  12:03:44,913 ERROR [IdentityLock] write lock for ////com/cubika/app/domain/bank/Bank
                                  ies/71 could not be acquired after 15000 ms. Locks: Read lock owners: [<null>:49
                                  ]
                                  Write lock owner: null
                                   (caller=<null>:50, lock info: read owners=[<null>:49] (activeReaders=1, activeW
                                  riter=null, waitingReaders=0, waitingWriters=0, waitingUpgrader=0)
                                  



                                  When the servers are started the application is fast, after 2 hours the appliacion begin
                                  to become so slowly. Realy I don'd why happed that. Somebody can say something to me on those errors?

                                  Alexis E. Quirós
                                  Buenos Aires - Argentina
                                  aquiros@cubika.com


                                  • 14. Re: Treecache Lock Error
                                    neotyk

                                    Hi,
                                    I get following exception:

                                    server.log wrote:
                                    Caused by: org.jboss.cache.lock.TimeoutException: write lock for /RawFrameCache/40995 could not be acquired after 15000 ms. Lock map ownership Read lock owners: []
                                    Write lock owner: < null>:272101
                                    (caller=< null>:272112)
                                    at org.jboss.cache.lock.IdentityLock.acquireWriteLock(IdentityLock.java:179)
                                    at org.jboss.cache.Node.acquireWriteLock(Node.java:424)
                                    at org.jboss.cache.Node.acquire(Node.java:401)
                                    at org.jboss.cache.interceptors.LockInterceptor.lock(LockInterceptor.java:184)
                                    at org.jboss.cache.interceptors.LockInterceptor.invoke(LockInterceptor.java:134)
                                    at org.jboss.cache.interceptors.Interceptor.invoke(Interceptor.java:40)
                                    at org.jboss.cache.interceptors.CreateIfNotExistsInterceptor.invoke(CreateIfNotExistsInterceptor.java:47)
                                    at org.jboss.cache.TreeCache.invokeMethod(TreeCache.java:3132)
                                    at org.jboss.cache.TreeCache.remove(TreeCache.java:1890)
                                    at sun.reflect.GeneratedMethodAccessor87.invoke(Unknown Source)
                                    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
                                    at java.lang.reflect.Method.invoke(Method.java:324)
                                    at org.jboss.mx.interceptor.ReflectedDispatcher.invoke(ReflectedDispatcher.java:144)
                                    at org.jboss.mx.server.Invocation.dispatch(Invocation.java:80)
                                    at org.jboss.mx.server.Invocation.invoke(Invocation.java:72)
                                    at org.jboss.mx.server.AbstractMBeanInvoker.invoke(AbstractMBeanInvoker.java:249)
                                    at org.jboss.mx.server.MBeanServerImpl.invoke(MBeanServerImpl.java:642)
                                    at org.jboss.mx.util.MBeanProxyExt.invoke(MBeanProxyExt.java:177)
                                    at $Proxy210.remove(Unknown Source)
                                    at net.kungfoo.j2ee.dao.cache.CacheHelper.removeObjectFromCache(CacheHelper.java:152)
                                    at net.kungfoo.j2ee.dao.cache.CacheHooverFramesDAOImpl.updateCurrentFrame(CacheHooverFramesDAOImpl.java:114)
                                    ... 30 more

                                    This exception ocures after about 3 days from application startup, operations executed in server are all the time the same, there is
                                    nothing special about the moment locks break, at least nothing special in application.

                                    I don't use replicated cache. Code accessing JBossCache is run in thread thit associated transactions.
                                    ejb-jar.xml wrote:
                                    ...
                                    < container-transaction >
                                    < method >
                                    < ejb-name>FramesMDB< /ejb-name>
                                    < method-name>*< /method-name>
                                    < /method>
                                    < trans-attribute>Required< /trans-attribute>
                                    < /container-transaction>
                                    ...

                                    This error is present on both JBossAS 4.0.1 and 4.0.2.

                                    Here is my JBossCache configuration:
                                    cache-service.xml wrote:
                                    < ?xml version="1.0" encoding="UTF-8"?>
                                    < server>
                                    < classpath codebase="./lib" archives="jboss-cache.jar, jgroups.jar"/>
                                    < mbean code="org.jboss.cache.TreeCache"
                                    name="jboss.cache:service=TreeCache">

                                    < depends>jboss:service=Naming< /depends>
                                    < depends>jboss:service=TransactionManager< /depends>
                                    < attribute name="TransactionManagerLookupClass">org.jboss.cache.JBossTransactionManagerLookup< /attribute>
                                    < attribute name="IsolationLevel">READ_UNCOMMITTED< /attribute>
                                    < attribute name="CacheMode">LOCAL< /attribute>
                                    < attribute name="UseReplQueue">false< /attribute>
                                    < attribute name="ReplQueueInterval">0< /attribute>
                                    < attribute name="ReplQueueMaxElements">0< /attribute>
                                    < attribute name="FetchStateOnStartup">true< /attribute>
                                    < attribute name="InitialStateRetrievalTimeout">5000< /attribute>
                                    < attribute name="SyncReplTimeout">10000< /attribute>
                                    < attribute name="LockAcquisitionTimeout">15000< /attribute>
                                    < attribute name="EvictionPolicyClass">org.jboss.cache.eviction.LRUPolicy< /attribute>
                                    < attribute name="EvictionPolicyConfig">
                                    < config>
                                    < attribute name="wakeUpIntervalSeconds">5< /attribute>
                                    < region name="/_default_">
                                    < attribute name="maxNodes">0< /attribute>
                                    < attribute name="timeToLiveSeconds">10000< /attribute>
                                    < /region>
                                    < /config>
                                    < /attribute>
                                    < /mbean>
                                    < /server>

                                    Please help solving this problem, I have no idea how to fix this issue.
                                    Thanks i advance,
                                    Hubert.

                                    1 2 Previous Next