Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfashe...

[powerpc.git] / fs / ocfs2 / dlm / dlmmaster.c
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c

index 427c0af..f784177 100644 (file)
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -47,7 +47,6 @@
  
  #include "dlmapi.h"
  #include "dlmcommon.h"
-#include "dlmdebug.h"
  #include "dlmdomain.h"
  
  #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
@@ -128,11 +127,8 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
         return 1;
  }
  
-#if 0
-/* Code here is included but defined out as it aids debugging */
-
  #define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
-void _dlm_print_nodemap(unsigned long *map, const char *mapname)
+static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
  {
         int i;
         printk("%s=[ ", mapname);
@@ -142,7 +138,7 @@ void _dlm_print_nodemap(unsigned long *map, const char *mapname)
         printk("]");
  }
  
-void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
  {
         int refs;
         char *type;
@@ -189,6 +185,9 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
         printk("\n");
  }
  
+#if 0
+/* Code here is included but defined out as it aids debugging */
+
  static void dlm_dump_mles(struct dlm_ctxt *dlm)
  {
         struct dlm_master_list_entry *mle;
@@ -709,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
  {
         struct dlm_lock_resource *res;
  
-       res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+       res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
         if (!res)
                 return NULL;
  
-       res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+       res->lockname.name = kmalloc(namelen, GFP_NOFS);
         if (!res->lockname.name) {
                 kfree(res);
                 return NULL;
@@ -741,6 +740,7 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
   */
  struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
                                           const char *lockid,
+                                         int namelen,
                                           int flags)
  {
         struct dlm_lock_resource *tmpres=NULL, *res=NULL;
@@ -749,13 +749,12 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
         int blocked = 0;
         int ret, nodenum;
         struct dlm_node_iter iter;
-       unsigned int namelen, hash;
+       unsigned int hash;
         int tries = 0;
         int bit, wait_on_recovery = 0;
  
         BUG_ON(!lockid);
  
-       namelen = strlen(lockid);
         hash = dlm_lockid_hash(lockid, namelen);
  
         mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
@@ -777,7 +776,7 @@ lookup:
                 mlog(0, "allocating a new resource\n");
                 /* nothing found and we need to allocate one. */
                 alloc_mle = (struct dlm_master_list_entry *)
-                       kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+                       kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                 if (!alloc_mle)
                         goto leave;
                 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -867,6 +866,7 @@ lookup:
         spin_unlock(&dlm->master_lock);
         spin_unlock(&dlm->spinlock);
  
+redo_request:
         while (wait_on_recovery) {
                 /* any cluster changes that occurred after dropping the
                  * dlm spinlock would be detectable be a change on the mle,
@@ -885,7 +885,7 @@ lookup:
                 } 
  
                 dlm_kick_recovery_thread(dlm);
-               msleep(100);
+               msleep(1000);
                 dlm_wait_for_recovery(dlm);
  
                 spin_lock(&dlm->spinlock);
@@ -898,13 +898,15 @@ lookup:
                 } else
                         wait_on_recovery = 0;
                 spin_unlock(&dlm->spinlock);
+
+               if (wait_on_recovery)
+                       dlm_wait_for_node_recovery(dlm, bit, 10000);
         }
  
         /* must wait for lock to be mastered elsewhere */
         if (blocked)
                 goto wait;
  
-redo_request:
         ret = -EINVAL;
         dlm_node_iter_init(mle->vote_map, &iter);
         while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
@@ -929,6 +931,7 @@ wait:
         /* keep going until the response map includes all nodes */
         ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
         if (ret < 0) {
+               wait_on_recovery = 1;
                 mlog(0, "%s:%.*s: node map changed, redo the "
                      "master request now, blocked=%d\n",
                      dlm->name, res->lockname.len,
@@ -939,7 +942,7 @@ wait:
                              dlm->name, res->lockname.len, 
                              res->lockname.name, blocked);
                         dlm_print_one_lock_resource(res);
-                       /* dlm_print_one_mle(mle); */
+                       dlm_print_one_mle(mle);
                         tries = 0;
                 }
                 goto redo_request;
@@ -994,12 +997,14 @@ recheck:
                 spin_unlock(&res->spinlock);
                 /* this will cause the master to re-assert across
                  * the whole cluster, freeing up mles */
-               ret = dlm_do_master_request(mle, res->owner);
-               if (ret < 0) {
-                       /* give recovery a chance to run */
-                       mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
-                       msleep(500);
-                       goto recheck;
+               if (res->owner != dlm->node_num) {
+                       ret = dlm_do_master_request(mle, res->owner);
+                       if (ret < 0) {
+                               /* give recovery a chance to run */
+                               mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+                               msleep(500);
+                               goto recheck;
+                       }
                 }
                 ret = 0;
                 goto leave;
@@ -1208,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                         set_bit(node, mle->vote_map);
                 } else {
                         mlog(ML_ERROR, "node down! %d\n", node);
-
-                       /* if the node wasn't involved in mastery skip it,
-                        * but clear it out from the maps so that it will
-                        * not affect mastery of this lockres */
-                       clear_bit(node, mle->response_map);
-                       clear_bit(node, mle->vote_map);
-                       if (!test_bit(node, mle->maybe_map))
-                               goto next;
-
-                       /* if we're already blocked on lock mastery, and the
-                        * dead node wasn't the expected master, or there is
-                        * another node in the maybe_map, keep waiting */
                         if (blocked) {
                                 int lowest = find_next_bit(mle->maybe_map,
                                                        O2NM_MAX_NODES, 0);
@@ -1227,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                                 /* act like it was never there */
                                 clear_bit(node, mle->maybe_map);
  
-                               if (node != lowest)
-                                       goto next;
-
-                               mlog(ML_ERROR, "expected master %u died while "
-                                    "this node was blocked waiting on it!\n",
-                                    node);
-                               lowest = find_next_bit(mle->maybe_map,
-                                                      O2NM_MAX_NODES,
-                                                      lowest+1);
-                               if (lowest < O2NM_MAX_NODES) {
-                                       mlog(0, "still blocked. waiting "
-                                            "on %u now\n", lowest);
-                                       goto next;
+                               if (node == lowest) {
+                                       mlog(0, "expected master %u died"
+                                           " while this node was blocked "
+                                           "waiting on it!\n", node);
+                                       lowest = find_next_bit(mle->maybe_map,
+                                                       O2NM_MAX_NODES,
+                                                       lowest+1);
+                                       if (lowest < O2NM_MAX_NODES) {
+                                               mlog(0, "%s:%.*s:still "
+                                                    "blocked. waiting on %u "
+                                                    "now\n", dlm->name,
+                                                    res->lockname.len,
+                                                    res->lockname.name,
+                                                    lowest);
+                                       } else {
+                                               /* mle is an MLE_BLOCK, but
+                                                * there is now nothing left to
+                                                * block on.  we need to return
+                                                * all the way back out and try
+                                                * again with an MLE_MASTER.
+                                                * dlm_do_local_recovery_cleanup
+                                                * has already run, so the mle
+                                                * refcount is ok */
+                                               mlog(0, "%s:%.*s: no "
+                                                    "longer blocking. try to "
+                                                    "master this here\n",
+                                                    dlm->name,
+                                                    res->lockname.len,
+                                                    res->lockname.name);
+                                               mle->type = DLM_MLE_MASTER;
+                                               mle->u.res = res;
+                                       }
                                 }
-
-                               /* mle is an MLE_BLOCK, but there is now
-                                * nothing left to block on.  we need to return
-                                * all the way back out and try again with
-                                * an MLE_MASTER. dlm_do_local_recovery_cleanup
-                                * has already run, so the mle refcount is ok */
-                               mlog(0, "no longer blocking. we can "
-                                    "try to master this here\n");
-                               mle->type = DLM_MLE_MASTER;
-                               memset(mle->maybe_map, 0,
-                                      sizeof(mle->maybe_map));
-                               memset(mle->response_map, 0,
-                                      sizeof(mle->maybe_map));
-                               memcpy(mle->vote_map, mle->node_map,
-                                      sizeof(mle->node_map));
-                               mle->u.res = res;
-                               set_bit(dlm->node_num, mle->maybe_map);
-
-                               ret = -EAGAIN;
-                               goto next;
                         }
  
-                       clear_bit(node, mle->maybe_map);
-                       if (node > dlm->node_num)
-                               goto next;
-
-                       mlog(0, "dead node in map!\n");
-                       /* yuck. go back and re-contact all nodes
-                        * in the vote_map, removing this node. */
-                       memset(mle->response_map, 0,
-                              sizeof(mle->response_map));
+                       /* now blank out everything, as if we had never
+                        * contacted anyone */
+                       memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+                       memset(mle->response_map, 0, sizeof(mle->response_map));
+                       /* reset the vote_map to the current node_map */
+                       memcpy(mle->vote_map, mle->node_map,
+                              sizeof(mle->node_map));
+                       /* put myself into the maybe map */
+                       if (mle->type != DLM_MLE_BLOCK)
+                               set_bit(dlm->node_num, mle->maybe_map);
                 }
                 ret = -EAGAIN;
-next:
                 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
         }
         return ret;
@@ -1539,7 +1531,7 @@ way_up_top:
                         spin_unlock(&dlm->spinlock);
  
                         mle = (struct dlm_master_list_entry *)
-                               kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+                               kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                         if (!mle) {
                                 response = DLM_MASTER_RESP_ERROR;
                                 mlog_errno(-ENOMEM);
@@ -1633,6 +1625,8 @@ again:
         dlm_node_iter_init(nodemap, &iter);
         while ((to = dlm_node_iter_next(&iter)) >= 0) {
                 int r = 0;
+               struct dlm_master_list_entry *mle = NULL;
+
                 mlog(0, "sending assert master to %d (%.*s)\n", to,
                      namelen, lockname);
                 memset(&assert, 0, sizeof(assert));
@@ -1644,20 +1638,28 @@ again:
                 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
                                             &assert, sizeof(assert), to, &r);
                 if (tmpret < 0) {
-                       mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+                       mlog(0, "assert_master returned %d!\n", tmpret);
                         if (!dlm_is_host_down(tmpret)) {
-                               mlog(ML_ERROR, "unhandled error!\n");
+                               mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
                                 BUG();
                         }
                         /* a node died.  finish out the rest of the nodes. */
-                       mlog(ML_ERROR, "link to %d went down!\n", to);
+                       mlog(0, "link to %d went down!\n", to);
                         /* any nonzero status return will do */
                         ret = tmpret;
                 } else if (r < 0) {
                         /* ok, something horribly messed.  kill thyself. */
                         mlog(ML_ERROR,"during assert master of %.*s to %u, "
                              "got %d.\n", namelen, lockname, to, r);
-                       dlm_dump_lock_resources(dlm);
+                       spin_lock(&dlm->spinlock);
+                       spin_lock(&dlm->master_lock);
+                       if (dlm_find_mle(dlm, &mle, (char *)lockname,
+                                        namelen)) {
+                               dlm_print_one_mle(mle);
+                               __dlm_put_mle(mle);
+                       }
+                       spin_unlock(&dlm->master_lock);
+                       spin_unlock(&dlm->spinlock);
                         BUG();
                 } else if (r == EAGAIN) {
                         mlog(0, "%.*s: node %u create mles on other "
@@ -1922,12 +1924,12 @@ done:
  
  kill:
         /* kill the caller! */
+       mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+            "and killing the other node now!  This node is OK and can continue.\n");
+       __dlm_print_one_lock_resource(res);
         spin_unlock(&res->spinlock);
         spin_unlock(&dlm->spinlock);
         dlm_lockres_put(res);
-       mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
-            "and killing the other node now!  This node is OK and can continue.\n");
-       dlm_dump_lock_resources(dlm);
         dlm_put(dlm);
         return -EINVAL;
  }
@@ -1937,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
                                int ignore_higher, u8 request_from, u32 flags)
  {
         struct dlm_work_item *item;
-       item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+       item = kcalloc(1, sizeof(*item), GFP_NOFS);
         if (!item)
                 return -ENOMEM;
  
@@ -1959,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
         list_add_tail(&item->list, &dlm->work_list);
         spin_unlock(&dlm->work_lock);
  
-       schedule_work(&dlm->dispatched_work);
+       queue_work(dlm->dlm_worker, &dlm->dispatched_work);
         return 0;
  }
  
@@ -2000,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
                 }
         }
  
+       /*
+        * If we're migrating this lock to someone else, we are no
+        * longer allowed to assert our own mastery.  OTOH, we need to
+        * prevent migration from starting while we're still asserting
+        * our dominance.  The reserved ast delays migration.
+        */
+       spin_lock(&res->spinlock);
+       if (res->state & DLM_LOCK_RES_MIGRATING) {
+               mlog(0, "Someone asked us to assert mastery, but we're "
+                    "in the middle of migration.  Skipping assert, "
+                    "the new master will handle that.\n");
+               spin_unlock(&res->spinlock);
+               goto put;
+       } else
+               __dlm_lockres_reserve_ast(res);
+       spin_unlock(&res->spinlock);
+
         /* this call now finishes out the nodemap
          * even if one or more nodes die */
         mlog(0, "worker about to master %.*s here, this=%u\n",
@@ -2009,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
                                    nodemap, flags);
         if (ret < 0) {
                 /* no need to restart, we are done */
-               mlog_errno(ret);
+               if (!dlm_is_host_down(ret))
+                       mlog_errno(ret);
         }
  
+       /* Ok, we've asserted ourselves.  Let's let migration start. */
+       dlm_lockres_release_ast(dlm, res);
+
+put:
         dlm_lockres_put(res);
  
         mlog(0, "finished with dlm_assert_master_worker\n");
@@ -2050,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
                                 BUG();
                         /* host is down, so answer for that node would be
                          * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+                       ret = 0;
                 }
  
                 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -2150,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
          */
  
         ret = -ENOMEM;
-       mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+       mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
         if (!mres) {
                 mlog_errno(ret);
                 goto leave;
         }
  
         mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-                                                               GFP_KERNEL);
+                                                               GFP_NOFS);
         if (!mle) {
                 mlog_errno(ret);
                 goto leave;
@@ -2350,7 +2375,6 @@ leave:
         mlog(0, "returning %d\n", ret);
         return ret;
  }
-EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
  
  int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
  {
@@ -2614,7 +2638,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
  
         /* preallocate.. if this fails, abort */
         mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-                                                        GFP_KERNEL);
+                                                        GFP_NOFS);
  
         if (!mle) {
                 ret = -ENOMEM;
@@ -2783,7 +2807,7 @@ top:
                                  * may result in the mle being unlinked and
                                  * freed, but there may still be a process
                                  * waiting in the dlmlock path which is fine. */
-                               mlog(ML_ERROR, "node %u was expected master\n",
+                               mlog(0, "node %u was expected master\n",
                                      dead_node);
                                 atomic_set(&mle->woken, 1);
                                 spin_unlock(&mle->spinlock);