2 * linux/net/sunrpc/xprt.c
4 * This is a generic RPC call interface supporting congestion avoidance,
5 * and asynchronous calls.
7 * The interface works like this:
9 * - When a process places a call, it allocates a request slot if
10 * one is available. Otherwise, it sleeps on the backlog queue
11 * (xprt_reserve).
12 * - Next, the caller puts together the RPC message, stuffs it into
13 * the request struct, and calls xprt_call().
14 * - xprt_call transmits the message and installs the caller on the
15 * socket's wait list. At the same time, it installs a timer that
16 * is run after the packet's timeout has expired.
17 * - When a packet arrives, the data_ready handler walks the list of
18 * pending requests for that socket. If a matching XID is found, the
19 * caller is woken up, and the timer removed.
20 * - When no reply arrives within the timeout interval, the timer is
21 * fired by the kernel and runs xprt_timer(). It either adjusts the
22 * timeout values (minor timeout) or wakes up the caller with a status
23 * of -ETIMEDOUT.
24 * - When the caller receives a notification from RPC that a reply arrived,
25 * it should release the RPC slot, and process the reply.
26 * If the call timed out, it may choose to retry the operation by
27 * adjusting the initial timeout value, and simply calling rpc_call
28 * again.
30 * Support for async RPC is done through a set of RPC-specific scheduling
31 * primitives that `transparently' work for processes as well as async
32 * tasks that rely on callbacks.
34 * Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de>
36 * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
37 * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
38 * TCP NFS related read + write fixes
39 * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
41 * Rewrite of large parts of the code in order to stabilize TCP stuff.
42 * Fix behaviour when socket buffer is full.
43 * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
46 #define __KERNEL_SYSCALLS__
48 #include <linux/version.h>
49 #include <linux/types.h>
50 #include <linux/slab.h>
51 #include <linux/capability.h>
52 #include <linux/sched.h>
53 #include <linux/errno.h>
54 #include <linux/socket.h>
56 #include <linux/net.h>
58 #include <linux/udp.h>
59 #include <linux/unistd.h>
60 #include <linux/sunrpc/clnt.h>
61 #include <linux/file.h>
64 #include <net/checksum.h>
68 #include <asm/uaccess.h>
75 # undef RPC_DEBUG_DATA
76 # define RPCDBG_FACILITY RPCDBG_XPRT
79 #define XPRT_MAX_BACKOFF (8)
84 static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
85 static void do_xprt_transmit(struct rpc_task *);
86 static void xprt_reserve_status(struct rpc_task *task);
87 static void xprt_disconnect(struct rpc_xprt *);
88 static void xprt_reconn_status(struct rpc_task *task);
89 static struct socket *xprt_create_socket(int, struct rpc_timeout *);
90 static int xprt_bind_socket(struct rpc_xprt *, struct socket *);
91 static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
95 * Print the buffer contents (first 128 bytes only--just enough for
99 xprt_pktdump(char *msg, u32 *packet, unsigned int count)
101 u8 *buf = (u8 *) packet;
104 dprintk("RPC: %s\n", msg);
105 for (j = 0; j < count && j < 128; j += 4) {
109 dprintk("0x%04x ", j);
111 dprintk("%02x%02x%02x%02x ",
112 buf[j], buf[j+1], buf[j+2], buf[j+3]);
118 xprt_pktdump(char *msg, u32 *packet, unsigned int count)
125 * Look up RPC transport given an INET socket
127 static inline struct rpc_xprt *
128 xprt_from_sock(struct sock *sk)
130 return (struct rpc_xprt *) sk->user_data;
134 * Serialize write access to sockets, in order to prevent different
135 * requests from interfering with each other.
136 * Also prevents TCP socket reconnections from colliding with writes.
139 __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
141 if (!xprt->snd_task) {
142 if (xprt->nocong || __xprt_get_cong(xprt, task))
143 xprt->snd_task = task;
145 if (xprt->snd_task != task) {
146 dprintk("RPC: %4d TCP write queue full\n", task->tk_pid);
147 task->tk_timeout = 0;
148 task->tk_status = -EAGAIN;
149 if (task->tk_rqstp && task->tk_rqstp->rq_nresend)
150 rpc_sleep_on(&xprt->resend, task, NULL, NULL);
152 rpc_sleep_on(&xprt->sending, task, NULL, NULL);
154 return xprt->snd_task == task;
158 xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
161 spin_lock_bh(&xprt->sock_lock);
162 retval = __xprt_lock_write(xprt, task);
163 spin_unlock_bh(&xprt->sock_lock);
168 __xprt_lock_write_next(struct rpc_xprt *xprt)
170 struct rpc_task *task;
174 task = rpc_wake_up_next(&xprt->resend);
176 if (!xprt->nocong && RPCXPRT_CONGESTED(xprt))
178 task = rpc_wake_up_next(&xprt->sending);
182 if (xprt->nocong || __xprt_get_cong(xprt, task))
183 xprt->snd_task = task;
187 * Releases the socket for use by other requests.
190 __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
192 if (xprt->snd_task == task)
193 xprt->snd_task = NULL;
194 __xprt_lock_write_next(xprt);
198 xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
200 spin_lock_bh(&xprt->sock_lock);
201 __xprt_release_write(xprt, task);
202 spin_unlock_bh(&xprt->sock_lock);
206 * Write data to socket.
209 xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req)
211 struct socket *sock = xprt->sock;
213 struct xdr_buf *xdr = &req->rq_snd_buf;
214 struct iovec niv[MAX_IOVEC];
215 unsigned int niov, slen, skip;
222 xprt_pktdump("packet data:",
223 req->rq_svec->iov_base,
224 req->rq_svec->iov_len);
226 /* Do not resend bytes that have already been sent */
227 skip = req->rq_bytes_sent;
228 slen = xdr->len - skip;
229 niov = xdr_kmap(niv, xdr, skip);
231 msg.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL;
233 msg.msg_iovlen = niov;
234 msg.msg_name = (struct sockaddr *) &xprt->addr;
235 msg.msg_namelen = sizeof(xprt->addr);
236 msg.msg_control = NULL;
237 msg.msg_controllen = 0;
239 oldfs = get_fs(); set_fs(get_ds());
240 clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
241 result = sock_sendmsg(sock, &msg, slen);
244 xdr_kunmap(xdr, skip);
246 dprintk("RPC: xprt_sendmsg(%d) = %d\n", slen, result);
253 /* When the server has died, an ICMP port unreachable message
254 * prompts ECONNREFUSED.
260 /* connection broken */
265 printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result);
271 * Van Jacobson congestion avoidance. Check if the congestion window
272 * overflowed. Put the task to sleep if this is the case.
275 __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task)
277 struct rpc_rqst *req = task->tk_rqstp;
281 dprintk("RPC: %4d xprt_cwnd_limited cong = %ld cwnd = %ld\n",
282 task->tk_pid, xprt->cong, xprt->cwnd);
283 if (RPCXPRT_CONGESTED(xprt))
286 xprt->cong += RPC_CWNDSCALE;
291 * Adjust the congestion window, and wake up the next task
292 * that has been sleeping due to congestion
295 __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
300 xprt->cong -= RPC_CWNDSCALE;
301 __xprt_lock_write_next(xprt);
305 * Adjust RPC congestion window
306 * We use a time-smoothed congestion estimator to avoid heavy oscillation.
309 xprt_adjust_cwnd(struct rpc_xprt *xprt, int result)
314 if (result >= 0 && cwnd <= xprt->cong) {
315 /* The (cwnd >> 1) term makes sure
316 * the result gets rounded properly. */
317 cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
318 if (cwnd > RPC_MAXCWND)
320 __xprt_lock_write_next(xprt);
321 } else if (result == -ETIMEDOUT) {
323 if (cwnd < RPC_CWNDSCALE)
324 cwnd = RPC_CWNDSCALE;
326 dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n",
327 xprt->cong, xprt->cwnd, cwnd);
332 * Adjust timeout values etc for next retransmit
335 xprt_adjust_timeout(struct rpc_timeout *to)
337 if (to->to_retries > 0) {
338 if (to->to_exponential)
339 to->to_current <<= 1;
341 to->to_current += to->to_increment;
342 if (to->to_maxval && to->to_current >= to->to_maxval)
343 to->to_current = to->to_maxval;
345 if (to->to_exponential)
346 to->to_initval <<= 1;
348 to->to_initval += to->to_increment;
349 if (to->to_maxval && to->to_initval >= to->to_maxval)
350 to->to_initval = to->to_maxval;
351 to->to_current = to->to_initval;
354 if (!to->to_current) {
355 printk(KERN_WARNING "xprt_adjust_timeout: to_current = 0!\n");
356 to->to_current = 5 * HZ;
358 pprintk("RPC: %lu %s\n", jiffies,
359 to->to_retries? "retrans" : "timeout");
360 return to->to_retries-- > 0;
364 * Close down a transport socket
367 xprt_close(struct rpc_xprt *xprt)
369 struct socket *sock = xprt->sock;
370 struct sock *sk = xprt->inet;
378 sk->user_data = NULL;
379 sk->data_ready = xprt->old_data_ready;
380 sk->state_change = xprt->old_state_change;
381 sk->write_space = xprt->old_write_space;
383 xprt_disconnect(xprt);
388 * TCP doesn't require rpciod any more - other things may,
389 * but rpciod takes care of that, not us.
396 * Mark a transport as disconnected
399 xprt_disconnect(struct rpc_xprt *xprt)
401 dprintk("RPC: disconnected transport %p\n", xprt);
402 xprt_clear_connected(xprt);
403 rpc_wake_up_status(&xprt->pending, -ENOTCONN);
407 * Reconnect a broken TCP connection.
409 * Note: This cannot collide with the TCP reads, as both run from rpciod
412 xprt_reconnect(struct rpc_task *task)
414 struct rpc_xprt *xprt = task->tk_xprt;
415 struct socket *sock = xprt->sock;
419 dprintk("RPC: %4d xprt_reconnect %p connected %d\n",
420 task->tk_pid, xprt, xprt_connected(xprt));
427 if (!xprt->addr.sin_port) {
428 task->tk_status = -EIO;
432 if (!xprt_lock_write(xprt, task))
434 if (xprt_connected(xprt))
437 if (sock && sock->state != SS_UNCONNECTED)
440 if (!(inet = xprt->inet)) {
441 /* Create an unconnected socket */
442 if (!(sock = xprt_create_socket(xprt->prot, &xprt->timeout)))
444 xprt_bind_socket(xprt, sock);
448 /* Now connect it asynchronously. */
449 dprintk("RPC: %4d connecting new socket\n", task->tk_pid);
450 status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
451 sizeof(xprt->addr), O_NONBLOCK);
465 printk("RPC: TCP connect error %d!\n", -status);
470 /* Protect against TCP socket state changes */
472 dprintk("RPC: %4d connect status %d connected %d\n",
473 task->tk_pid, status, xprt_connected(xprt));
475 if (inet->state != TCP_ESTABLISHED) {
476 task->tk_timeout = xprt->timeout.to_maxval;
477 /* if the socket is already closing, delay 5 secs */
478 if ((1<<inet->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
479 task->tk_timeout = 5*HZ;
480 rpc_sleep_on(&xprt->pending, task, xprt_reconn_status, NULL);
488 rpc_delay(task, 5*HZ);
489 task->tk_status = -ENOTCONN;
492 xprt_release_write(xprt, task);
496 * Reconnect timeout. We just mark the transport as not being in the
497 * process of reconnecting, and leave the rest to the upper layers.
500 xprt_reconn_status(struct rpc_task *task)
502 struct rpc_xprt *xprt = task->tk_xprt;
504 dprintk("RPC: %4d xprt_reconn_timeout %d\n",
505 task->tk_pid, task->tk_status);
507 xprt_release_write(xprt, task);
511 * Look up the RPC request corresponding to a reply, and then lock it.
513 static inline struct rpc_rqst *
514 xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid)
516 struct list_head *pos;
517 struct rpc_rqst *req = NULL;
519 list_for_each(pos, &xprt->recv) {
520 struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list);
521 if (entry->rq_xid == xid) {
530 * Complete reply received.
531 * The TCP code relies on us to remove the request from xprt->pending.
534 xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied)
536 struct rpc_task *task = req->rq_task;
537 struct rpc_clnt *clnt = task->tk_client;
539 /* Adjust congestion window */
541 xprt_adjust_cwnd(xprt, copied);
542 __xprt_put_cong(xprt, req);
543 if (!req->rq_nresend) {
544 int timer = rpcproc_timer(clnt, task->tk_msg.rpc_proc);
546 rpc_update_rtt(&clnt->cl_rtt, timer, (long)jiffies - req->rq_xtime);
548 rpc_clear_timeo(&clnt->cl_rtt);
552 /* Profile only reads for now */
554 static unsigned long nextstat = 0;
555 static unsigned long pkt_rtt = 0, pkt_len = 0, pkt_cnt = 0;
558 pkt_len += req->rq_slen + copied;
559 pkt_rtt += jiffies - req->rq_xtime;
560 if (time_before(nextstat, jiffies)) {
561 printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd);
562 printk("RPC: %ld %ld %ld %ld stat\n",
563 jiffies, pkt_cnt, pkt_len, pkt_rtt);
564 pkt_rtt = pkt_len = pkt_cnt = 0;
565 nextstat = jiffies + 5 * HZ;
570 dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied);
571 req->rq_received = copied;
572 list_del_init(&req->rq_list);
574 /* ... and wake up the process. */
575 rpc_wake_up_task(task);
580 skb_read_bits(skb_reader_t *desc, void *to, size_t len)
582 if (len > desc->count)
584 skb_copy_bits(desc->skb, desc->offset, to, len);
591 skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len)
593 unsigned int csum2, pos;
595 if (len > desc->count)
598 csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
599 desc->csum = csum_block_add(desc->csum, csum2, pos);
606 * We have set things up such that we perform the checksum of the UDP
607 * packet in parallel with the copies into the RPC client iovec. -DaveM
610 csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
615 desc.offset = sizeof(struct udphdr);
616 desc.count = skb->len - desc.offset;
618 if (skb->ip_summed == CHECKSUM_UNNECESSARY)
621 desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
622 xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits);
623 if (desc.offset != skb->len) {
625 csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
626 desc.csum = csum_block_add(desc.csum, csum2, desc.offset);
628 if ((unsigned short)csum_fold(desc.csum))
632 xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits);
637 * Input handler for RPC replies. Called from a bottom half and hence
638 * atomic.
641 udp_data_ready(struct sock *sk, int len)
643 struct rpc_task *task;
644 struct rpc_xprt *xprt;
645 struct rpc_rqst *rovr;
647 int err, repsize, copied;
649 dprintk("RPC: udp_data_ready...\n");
650 if (!(xprt = xprt_from_sock(sk))) {
651 printk("RPC: udp_data_ready request not found!\n");
655 dprintk("RPC: udp_data_ready client %p\n", xprt);
657 if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
663 repsize = skb->len - sizeof(struct udphdr);
665 printk("RPC: impossible RPC reply size %d!\n", repsize);
669 /* Look up and lock the request corresponding to the given XID */
670 spin_lock(&xprt->sock_lock);
671 rovr = xprt_lookup_rqst(xprt, *(u32 *) (skb->h.raw + sizeof(struct udphdr)));
674 task = rovr->rq_task;
676 dprintk("RPC: %4d received reply\n", task->tk_pid);
677 xprt_pktdump("packet data:",
678 (u32 *) (skb->h.raw+sizeof(struct udphdr)), repsize);
680 if ((copied = rovr->rq_rlen) > repsize)
683 /* Suck it into the iovec, verify checksum if not done by hw. */
684 if (csum_partial_copy_to_xdr(&rovr->rq_rcv_buf, skb))
687 /* Something worked... */
688 dst_confirm(skb->dst);
690 xprt_complete_rqst(xprt, rovr, copied);
693 spin_unlock(&xprt->sock_lock);
695 skb_free_datagram(sk, skb);
697 if (sk->sleep && waitqueue_active(sk->sleep))
698 wake_up_interruptible(sk->sleep);
702 * Copy from an skb into memory and shrink the skb.
705 tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
707 if (len > desc->count)
709 skb_copy_bits(desc->skb, desc->offset, p, len);
716 * TCP read fragment marker
719 tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
724 p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset;
725 len = sizeof(xprt->tcp_recm) - xprt->tcp_offset;
726 used = tcp_copy_data(desc, p, len);
727 xprt->tcp_offset += used;
730 xprt->tcp_reclen = ntohl(xprt->tcp_recm);
731 if (xprt->tcp_reclen & 0x80000000)
732 xprt->tcp_flags |= XPRT_LAST_FRAG;
734 xprt->tcp_flags &= ~XPRT_LAST_FRAG;
735 xprt->tcp_reclen &= 0x7fffffff;
736 xprt->tcp_flags &= ~XPRT_COPY_RECM;
737 xprt->tcp_offset = 0;
738 /* Sanity check of the record length */
739 if (xprt->tcp_reclen < 4) {
740 printk(KERN_ERR "RPC: Invalid TCP record fragment length\n");
741 xprt_disconnect(xprt);
743 dprintk("RPC: reading TCP record fragment of length %d\n",
748 tcp_check_recm(struct rpc_xprt *xprt)
750 if (xprt->tcp_offset == xprt->tcp_reclen) {
751 xprt->tcp_flags |= XPRT_COPY_RECM;
752 xprt->tcp_offset = 0;
753 if (xprt->tcp_flags & XPRT_LAST_FRAG) {
754 xprt->tcp_flags &= ~XPRT_COPY_DATA;
755 xprt->tcp_flags |= XPRT_COPY_XID;
756 xprt->tcp_copied = 0;
765 tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc)
770 len = sizeof(xprt->tcp_xid) - xprt->tcp_offset;
771 dprintk("RPC: reading XID (%Zu bytes)\n", len);
772 p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset;
773 used = tcp_copy_data(desc, p, len);
774 xprt->tcp_offset += used;
777 xprt->tcp_flags &= ~XPRT_COPY_XID;
778 xprt->tcp_flags |= XPRT_COPY_DATA;
779 xprt->tcp_copied = 4;
780 dprintk("RPC: reading reply for XID %08x\n", xprt->tcp_xid);
781 tcp_check_recm(xprt);
785 * TCP read and complete request
788 tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
790 struct rpc_rqst *req;
791 struct xdr_buf *rcvbuf;
794 /* Find and lock the request corresponding to this xid */
795 spin_lock(&xprt->sock_lock);
796 req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
798 xprt->tcp_flags &= ~XPRT_COPY_DATA;
799 dprintk("RPC: XID %08x request not found!\n",
801 spin_unlock(&xprt->sock_lock);
805 rcvbuf = &req->rq_rcv_buf;
807 if (len > xprt->tcp_reclen - xprt->tcp_offset) {
808 skb_reader_t my_desc;
810 len = xprt->tcp_reclen - xprt->tcp_offset;
811 memcpy(&my_desc, desc, sizeof(my_desc));
813 xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
814 &my_desc, tcp_copy_data);
818 xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
819 desc, tcp_copy_data);
820 xprt->tcp_copied += len;
821 xprt->tcp_offset += len;
823 if (xprt->tcp_copied == req->rq_rlen)
824 xprt->tcp_flags &= ~XPRT_COPY_DATA;
825 else if (xprt->tcp_offset == xprt->tcp_reclen) {
826 if (xprt->tcp_flags & XPRT_LAST_FRAG)
827 xprt->tcp_flags &= ~XPRT_COPY_DATA;
830 if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
831 dprintk("RPC: %4d received reply complete\n",
832 req->rq_task->tk_pid);
833 xprt_complete_rqst(xprt, req, xprt->tcp_copied);
835 spin_unlock(&xprt->sock_lock);
836 tcp_check_recm(xprt);
840 * TCP discard extra bytes from a short read
843 tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
847 len = xprt->tcp_reclen - xprt->tcp_offset;
848 if (len > desc->count)
852 xprt->tcp_offset += len;
853 tcp_check_recm(xprt);
857 * TCP record receive routine
858 * We first have to grab the record marker, then the XID, then the data.
861 tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
862 unsigned int offset, size_t len)
864 struct rpc_xprt *xprt = (struct rpc_xprt *)rd_desc->buf;
865 skb_reader_t desc = { skb, offset, len };
867 dprintk("RPC: tcp_data_recv\n");
869 /* Read in a new fragment marker if necessary */
870 /* Can we ever really expect to get completely empty fragments? */
871 if (xprt->tcp_flags & XPRT_COPY_RECM) {
872 tcp_read_fraghdr(xprt, &desc);
875 /* Read in the xid if necessary */
876 if (xprt->tcp_flags & XPRT_COPY_XID) {
877 tcp_read_xid(xprt, &desc);
880 /* Read in the request data */
881 if (xprt->tcp_flags & XPRT_COPY_DATA) {
882 tcp_read_request(xprt, &desc);
885 /* Skip over any trailing bytes on short reads */
886 tcp_read_discard(xprt, &desc);
887 } while (desc.count && xprt_connected(xprt));
888 dprintk("RPC: tcp_data_recv done\n");
889 return len - desc.count;
892 static void tcp_data_ready(struct sock *sk, int bytes)
894 struct rpc_xprt *xprt;
895 read_descriptor_t rd_desc;
897 dprintk("RPC: tcp_data_ready...\n");
898 if (!(xprt = xprt_from_sock(sk))) {
899 printk("RPC: tcp_data_ready socket info not found!\n");
905 /* We use rd_desc to pass struct xprt to tcp_data_recv */
906 rd_desc.buf = (char *)xprt;
907 rd_desc.count = 65536;
908 tcp_read_sock(sk, &rd_desc, tcp_data_recv);
912 tcp_state_change(struct sock *sk)
914 struct rpc_xprt *xprt;
916 if (!(xprt = xprt_from_sock(sk)))
918 dprintk("RPC: tcp_state_change client %p...\n", xprt);
919 dprintk("RPC: state %x conn %d dead %d zapped %d\n",
920 sk->state, xprt_connected(xprt),
921 sk->dead, sk->zapped);
924 case TCP_ESTABLISHED:
925 if (xprt_test_and_set_connected(xprt))
928 /* Reset TCP record info */
929 xprt->tcp_offset = 0;
930 xprt->tcp_reclen = 0;
931 xprt->tcp_copied = 0;
932 xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID;
934 spin_lock(&xprt->sock_lock);
935 if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending)
936 rpc_wake_up_task(xprt->snd_task);
937 spin_unlock(&xprt->sock_lock);
943 xprt_disconnect(xprt);
947 if (sk->sleep && waitqueue_active(sk->sleep))
948 wake_up_interruptible_all(sk->sleep);
952 * Called when more output buffer space is available for this socket.
953 * We try not to wake our writers until they can make "significant"
954 * progress, otherwise we'll waste resources thrashing sock_sendmsg
955 * with a bunch of small requests.
958 xprt_write_space(struct sock *sk)
960 struct rpc_xprt *xprt;
963 if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->socket))
968 /* Wait until we have enough socket memory */
970 /* from net/ipv4/tcp.c:tcp_write_space */
971 if (tcp_wspace(sk) < tcp_min_write_space(sk))
974 /* from net/core/sock.c:sock_def_write_space */
975 if (!sock_writeable(sk))
979 if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))
982 spin_lock_bh(&xprt->sock_lock);
983 if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending)
984 rpc_wake_up_task(xprt->snd_task);
985 spin_unlock_bh(&xprt->sock_lock);
986 if (sk->sleep && waitqueue_active(sk->sleep))
987 wake_up_interruptible(sk->sleep);
991 * Exponential backoff for UDP retries
994 xprt_expbackoff(struct rpc_task *task, struct rpc_rqst *req)
999 backoff = min(rpc_ntimeo(&task->tk_client->cl_rtt), XPRT_MAX_BACKOFF);
1000 if (req->rq_ntimeo < (1 << backoff))
1006 * RPC receive timeout handler.
1009 xprt_timer(struct rpc_task *task)
1011 struct rpc_rqst *req = task->tk_rqstp;
1012 struct rpc_xprt *xprt = req->rq_xprt;
1014 spin_lock(&xprt->sock_lock);
1015 if (req->rq_received)
1018 if (!xprt->nocong) {
1019 if (xprt_expbackoff(task, req)) {
1020 rpc_add_timer(task, xprt_timer);
1023 rpc_inc_timeo(&task->tk_client->cl_rtt);
1024 xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT);
1028 dprintk("RPC: %4d xprt_timer (%s request)\n",
1029 task->tk_pid, req ? "pending" : "backlogged");
1031 task->tk_status = -ETIMEDOUT;
1033 task->tk_timeout = 0;
1034 rpc_wake_up_task(task);
1036 spin_unlock(&xprt->sock_lock);
1040 * Place the actual RPC call.
1041 * We have to copy the iovec because sendmsg fiddles with its contents.
1044 xprt_transmit(struct rpc_task *task)
1046 struct rpc_rqst *req = task->tk_rqstp;
1047 struct rpc_xprt *xprt = req->rq_xprt;
1049 dprintk("RPC: %4d xprt_transmit(%x)\n", task->tk_pid,
1050 *(u32 *)(req->rq_svec[0].iov_base));
1053 task->tk_status = -EIO;
1055 if (!xprt_connected(xprt))
1056 task->tk_status = -ENOTCONN;
1058 if (task->tk_status < 0)
1061 if (task->tk_rpcwait)
1062 rpc_remove_wait_queue(task);
1064 /* set up everything as needed. */
1065 /* Write the record marker */
1067 u32 *marker = req->rq_svec[0].iov_base;
1069 *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker)));
1072 spin_lock_bh(&xprt->sock_lock);
1073 if (!__xprt_lock_write(xprt, task)) {
1074 spin_unlock_bh(&xprt->sock_lock);
1077 if (list_empty(&req->rq_list)) {
1078 list_add_tail(&req->rq_list, &xprt->recv);
1079 req->rq_received = 0;
1081 spin_unlock_bh(&xprt->sock_lock);
1083 do_xprt_transmit(task);
1087 do_xprt_transmit(struct rpc_task *task)
1089 struct rpc_clnt *clnt = task->tk_client;
1090 struct rpc_rqst *req = task->tk_rqstp;
1091 struct rpc_xprt *xprt = req->rq_xprt;
1092 int status, retry = 0;
1095 /* Continue transmitting the packet/record. We must be careful
1096 * to cope with writespace callbacks arriving _after_ we have
1097 * called xprt_sendmsg().
1100 req->rq_xtime = jiffies;
1101 status = xprt_sendmsg(xprt, req);
1107 req->rq_bytes_sent += status;
1109 if (req->rq_bytes_sent >= req->rq_slen)
1112 if (status >= req->rq_slen)
1118 dprintk("RPC: %4d xmit incomplete (%d left of %d)\n",
1119 task->tk_pid, req->rq_slen - req->rq_bytes_sent,
1127 /* Note: at this point, task->tk_sleeping has not yet been set,
1128 * hence there is no danger of the waking up task being put on
1129 * schedq, and being picked up by a parallel run of rpciod().
1131 if (req->rq_received)
1134 task->tk_status = status;
1138 if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) {
1139 /* Protect against races with xprt_write_space */
1140 spin_lock_bh(&xprt->sock_lock);
1141 if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) {
1142 task->tk_timeout = req->rq_timeout.to_current;
1143 rpc_sleep_on(&xprt->pending, task, NULL, NULL);
1145 spin_unlock_bh(&xprt->sock_lock);
1148 /* Keep holding the socket if it is blocked */
1149 rpc_delay(task, HZ>>4);
1157 xprt_disconnect(xprt);
1158 req->rq_bytes_sent = 0;
1161 xprt_release_write(xprt, task);
1164 dprintk("RPC: %4d xmit complete\n", task->tk_pid);
1165 /* Set the task's receive timeout value */
1166 if (!xprt->nocong) {
1167 task->tk_timeout = rpc_calc_rto(&clnt->cl_rtt,
1168 rpcproc_timer(clnt, task->tk_msg.rpc_proc));
1170 if (task->tk_timeout > req->rq_timeout.to_maxval)
1171 task->tk_timeout = req->rq_timeout.to_maxval;
1173 task->tk_timeout = req->rq_timeout.to_current;
1174 spin_lock_bh(&xprt->sock_lock);
1175 if (!req->rq_received)
1176 rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer);
1177 __xprt_release_write(xprt, task);
1178 spin_unlock_bh(&xprt->sock_lock);
1182 * Reserve an RPC call slot.
1185 xprt_reserve(struct rpc_task *task)
1187 struct rpc_xprt *xprt = task->tk_xprt;
1189 /* We already have an initialized request. */
1193 spin_lock(&xprt->xprt_lock);
1194 xprt_reserve_status(task);
1195 if (task->tk_rqstp) {
1196 task->tk_timeout = 0;
1197 } else if (!task->tk_timeout) {
1198 task->tk_status = -ENOBUFS;
1200 dprintk("RPC: xprt_reserve waiting on backlog\n");
1201 task->tk_status = -EAGAIN;
1202 rpc_sleep_on(&xprt->backlog, task, NULL, NULL);
1204 spin_unlock(&xprt->xprt_lock);
1205 dprintk("RPC: %4d xprt_reserve returns %d\n",
1206 task->tk_pid, task->tk_status);
1207 return task->tk_status;
1211 * Reservation callback
1214 xprt_reserve_status(struct rpc_task *task)
1216 struct rpc_xprt *xprt = task->tk_xprt;
1217 struct rpc_rqst *req;
1219 if (xprt->shutdown) {
1220 task->tk_status = -EIO;
1221 } else if (task->tk_status < 0) {
1223 } else if (task->tk_rqstp) {
1224 /* We've already been given a request slot: NOP */
1226 if (!(req = xprt->free))
1228 /* OK: There's room for us. Grab a free slot */
1229 xprt->free = req->rq_next;
1230 req->rq_next = NULL;
1231 task->tk_rqstp = req;
1232 xprt_request_init(task, xprt);
1238 task->tk_status = -EAGAIN;
1242 * Initialize RPC request
1245 xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
1247 struct rpc_rqst *req = task->tk_rqstp;
1251 xid = CURRENT_TIME << 12;
1253 dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, xid);
1254 task->tk_status = 0;
1255 req->rq_timeout = xprt->timeout;
1256 req->rq_task = task;
1257 req->rq_xprt = xprt;
1258 req->rq_xid = xid++;
1261 INIT_LIST_HEAD(&req->rq_list);
1265 * Release an RPC call slot
1268 xprt_release(struct rpc_task *task)
1270 struct rpc_xprt *xprt = task->tk_xprt;
1271 struct rpc_rqst *req;
1273 if (!(req = task->tk_rqstp))
1275 spin_lock_bh(&xprt->sock_lock);
1276 __xprt_release_write(xprt, task);
1277 __xprt_put_cong(xprt, req);
1278 if (!list_empty(&req->rq_list))
1279 list_del(&req->rq_list);
1280 spin_unlock_bh(&xprt->sock_lock);
1281 task->tk_rqstp = NULL;
1282 memset(req, 0, sizeof(*req)); /* mark unused */
1284 dprintk("RPC: %4d release request %p\n", task->tk_pid, req);
1286 spin_lock(&xprt->xprt_lock);
1287 req->rq_next = xprt->free;
1290 xprt_clear_backlog(xprt);
1291 spin_unlock(&xprt->xprt_lock);
1295 * Set default timeout parameters
1298 xprt_default_timeout(struct rpc_timeout *to, int proto)
1300 if (proto == IPPROTO_UDP)
1301 xprt_set_timeout(to, 5, 5 * HZ);
1303 xprt_set_timeout(to, 5, 60 * HZ);
1307 * Set constant timeout
1310 xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr)
1314 to->to_increment = incr;
1315 to->to_maxval = incr * retr;
1316 to->to_resrvval = incr * retr;
1317 to->to_retries = retr;
1318 to->to_exponential = 0;
1322 * Initialize an RPC client
1324 static struct rpc_xprt *
1325 xprt_setup(struct socket *sock, int proto,
1326 struct sockaddr_in *ap, struct rpc_timeout *to)
1328 struct rpc_xprt *xprt;
1329 struct rpc_rqst *req;
1332 dprintk("RPC: setting up %s transport...\n",
1333 proto == IPPROTO_UDP? "UDP" : "TCP");
1335 if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL)
1337 memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */
1341 xprt->stream = (proto == IPPROTO_TCP)? 1 : 0;
1343 xprt->cwnd = RPC_MAXCWND;
1346 xprt->cwnd = RPC_INITCWND;
1347 spin_lock_init(&xprt->sock_lock);
1348 spin_lock_init(&xprt->xprt_lock);
1349 init_waitqueue_head(&xprt->cong_wait);
1351 INIT_LIST_HEAD(&xprt->recv);
1353 /* Set timeout parameters */
1355 xprt->timeout = *to;
1356 xprt->timeout.to_current = to->to_initval;
1357 xprt->timeout.to_resrvval = to->to_maxval << 1;
1359 xprt_default_timeout(&xprt->timeout, xprt->prot);
1361 INIT_RPC_WAITQ(&xprt->pending, "xprt_pending");
1362 INIT_RPC_WAITQ(&xprt->sending, "xprt_sending");
1363 INIT_RPC_WAITQ(&xprt->resend, "xprt_resend");
1364 INIT_RPC_WAITQ(&xprt->backlog, "xprt_backlog");
1366 /* initialize free list */
1367 for (i = 0, req = xprt->slot; i < RPC_MAXREQS-1; i++, req++)
1368 req->rq_next = req + 1;
1369 req->rq_next = NULL;
1370 xprt->free = xprt->slot;
1372 dprintk("RPC: created transport %p\n", xprt);
1374 xprt_bind_socket(xprt, sock);
1379 * Bind to a reserved port
1382 xprt_bindresvport(struct socket *sock)
1384 struct sockaddr_in myaddr;
1387 memset(&myaddr, 0, sizeof(myaddr));
1388 myaddr.sin_family = AF_INET;
1391 myaddr.sin_port = htons(port);
1392 err = sock->ops->bind(sock, (struct sockaddr *) &myaddr,
1394 } while (err == -EADDRINUSE && --port > 0);
1397 printk("RPC: Can't bind to reserved port (%d).\n", -err);
1403 xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock)
1405 struct sock *sk = sock->sk;
1410 sk->user_data = xprt;
1411 xprt->old_data_ready = sk->data_ready;
1412 xprt->old_state_change = sk->state_change;
1413 xprt->old_write_space = sk->write_space;
1414 if (xprt->prot == IPPROTO_UDP) {
1415 sk->data_ready = udp_data_ready;
1416 sk->no_check = UDP_CSUM_NORCV;
1417 xprt_set_connected(xprt);
1419 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1420 tp->nonagle = 1; /* disable Nagle's algorithm */
1421 sk->data_ready = tcp_data_ready;
1422 sk->state_change = tcp_state_change;
1423 xprt_clear_connected(xprt);
1425 sk->write_space = xprt_write_space;
1427 /* Reset to new socket */
1431 * TCP requires that the rpc I/O daemon be present
1440 * Set socket buffer length
1443 xprt_sock_setbufsize(struct rpc_xprt *xprt)
1445 struct sock *sk = xprt->inet;
1449 if (xprt->rcvsize) {
1450 sk->userlocks |= SOCK_RCVBUF_LOCK;
1451 sk->rcvbuf = xprt->rcvsize * RPC_MAXCONG * 2;
1453 if (xprt->sndsize) {
1454 sk->userlocks |= SOCK_SNDBUF_LOCK;
1455 sk->sndbuf = xprt->sndsize * RPC_MAXCONG * 2;
1456 sk->write_space(sk);
1461 * Create a client socket given the protocol and peer address.
1463 static struct socket *
1464 xprt_create_socket(int proto, struct rpc_timeout *to)
1466 struct socket *sock;
1469 dprintk("RPC: xprt_create_socket(%s %d)\n",
1470 (proto == IPPROTO_UDP)? "udp" : "tcp", proto);
1472 type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1474 if ((err = sock_create(PF_INET, type, proto, &sock)) < 0) {
1475 printk("RPC: can't create socket (%d).\n", -err);
1479 /* If the caller has the capability, bind to a reserved port */
1480 if (capable(CAP_NET_BIND_SERVICE) && xprt_bindresvport(sock) < 0)
1491 * Create an RPC client transport given the protocol and peer address.
1494 xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to)
1496 struct socket *sock;
1497 struct rpc_xprt *xprt;
1499 dprintk("RPC: xprt_create_proto called\n");
1501 if (!(sock = xprt_create_socket(proto, to)))
1504 if (!(xprt = xprt_setup(sock, proto, sap, to)))
1511 * Prepare for transport shutdown.
1514 xprt_shutdown(struct rpc_xprt *xprt)
1517 rpc_wake_up(&xprt->sending);
1518 rpc_wake_up(&xprt->resend);
1519 rpc_wake_up(&xprt->pending);
1520 rpc_wake_up(&xprt->backlog);
1521 if (waitqueue_active(&xprt->cong_wait))
1522 wake_up(&xprt->cong_wait);
1526 * Clear the xprt backlog queue
1529 xprt_clear_backlog(struct rpc_xprt *xprt) {
1530 rpc_wake_up_next(&xprt->backlog);
1531 if (waitqueue_active(&xprt->cong_wait))
1532 wake_up(&xprt->cong_wait);
1537 * Destroy an RPC transport, killing off all requests.
1540 xprt_destroy(struct rpc_xprt *xprt)
1542 dprintk("RPC: destroying transport %p\n", xprt);
1543 xprt_shutdown(xprt);