Brian Behlendorf writes:
The root cause of the NFS hang we were observing appears to be a rare
deadlock between the kernel provided usermodehelper API and the linux NFS
client. The deadlock can arise because both of these services use the
generic linux work queues. The usermodehelper API run the specified user
application in the context of the work queue. And NFS submits both cleanup
and reconnect work to the generic work queue for handling. Normally this
is fine but a deadlock can result in the following situation.
- NFS client is in a disconnected state
- [events/0] runs a usermodehelper app with an NFS dependent operation,
this triggers an NFS reconnect.
- NFS reconnect happens to be submitted to [events/0] work queue.
- Deadlock, the [events/0] work queue will never process the
reconnect because it is blocked on the previous NFS dependent
operation which will not complete.`
The solution is simply to run reconnect requests on rpciod.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_clear_bit();
} else
clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_clear_bit();
} else
- schedule_work(&xprt->task_cleanup);
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
if (xprt_connecting(xprt))
xprt_release_write(xprt, NULL);
else
if (xprt_connecting(xprt))
xprt_release_write(xprt, NULL);
else
- schedule_work(&xprt->task_cleanup);
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
return;
out_abort:
spin_unlock(&xprt->transport_lock);
return;
out_abort:
spin_unlock(&xprt->transport_lock);
dprintk("RPC: xs_destroy xprt %p\n", xprt);
dprintk("RPC: xs_destroy xprt %p\n", xprt);
- cancel_delayed_work(&transport->connect_worker);
- flush_scheduled_work();
+ cancel_rearming_delayed_work(&transport->connect_worker);
xprt_disconnect(xprt);
xs_close(xprt);
xprt_disconnect(xprt);
xs_close(xprt);
/* Try to schedule an autoclose RPC calls */
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
/* Try to schedule an autoclose RPC calls */
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- schedule_work(&xprt->task_cleanup);
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
default:
xprt_disconnect(xprt);
}
default:
xprt_disconnect(xprt);
}
dprintk("RPC: xs_connect delayed xprt %p for %lu "
"seconds\n",
xprt, xprt->reestablish_timeout / HZ);
dprintk("RPC: xs_connect delayed xprt %p for %lu "
"seconds\n",
xprt, xprt->reestablish_timeout / HZ);
- schedule_delayed_work(&transport->connect_worker,
- xprt->reestablish_timeout);
+ queue_delayed_work(rpciod_workqueue,
+ &transport->connect_worker,
+ xprt->reestablish_timeout);
xprt->reestablish_timeout <<= 1;
if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
} else {
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
xprt->reestablish_timeout <<= 1;
if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
} else {
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
- schedule_delayed_work(&transport->connect_worker, 0);
-
- /* flush_scheduled_work can sleep... */
- if (!RPC_IS_ASYNC(task))
- flush_scheduled_work();
+ queue_delayed_work(rpciod_workqueue,
+ &transport->connect_worker, 0);