/*
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM
 *	layers all have identical poll code and mostly identical recvmsg()
 *	code. So we share it here. The poll was shared before but buried
 *	in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old
 *			udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy() understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET. IPX can no
 *					longer use the SO_TYPE hack but AX.25 now
 *					works right, and SPX is feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>

#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/checksum.h>
/*
 *	Is a socket 'connection oriented' ?
 */

static inline int connection_based(struct sock *sk)
{
	return sk->type == SOCK_SEQPACKET || sk->type == SOCK_STREAM;
}
/*
 *	Wait for a packet..
 */

static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;
	DECLARE_WAITQUEUE(wait, current);

	__set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue_exclusive(sk->sleep, &wait);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	/* Data already waiting? */
	if (!skb_queue_empty(&sk->receive_queue))
		goto ready;

	/* Socket shut down? */
	if (sk->shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected. If so we report the problem. */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->state == TCP_ESTABLISHED || sk->state == TCP_LISTEN))
		goto out_err;

	/* Handle signals. */
	if (signal_pending(current))
		goto interrupted;

	*timeo_p = schedule_timeout(*timeo_p);
ready:
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
	return 0;

interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
out:
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
	return error;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}
/*
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as the IPX, AX.25 and Appletalk layers. It also finally
 *	fixes the long standing peek and read race for datagram sockets.
 *	If you alter this routine remember it must be re-entrant.
 *
 *	This function does NOT lock the socket (see the note below); the
 *	caller just releases the returned skb, usually by calling
 *	skb_free_datagram().
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	* 8) Great win.)
 *	*			                    --ANK (980729)
 *
 *	The order of the tests when we find no data waiting is specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err)
{
	struct sk_buff *skb;
	long timeo;
	/* Caller is allowed not to check sk->err before skb_recv_datagram() */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, noblock);

	do {
		/* Again only user level code calls this function, so nothing
		   interrupt level will suddenly eat the receive_queue.

		   Look at current nfs client by the way...
		   However, this function was correct in any case. 8)
		 */
		if (flags & MSG_PEEK) {
			unsigned long cpu_flags;

			spin_lock_irqsave(&sk->receive_queue.lock, cpu_flags);
			skb = skb_peek(&sk->receive_queue);
			if (skb)
				atomic_inc(&skb->users);
			spin_unlock_irqrestore(&sk->receive_queue.lock, cpu_flags);
		} else
			skb = skb_dequeue(&sk->receive_queue);

		if (skb)
			return skb;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;
	} while (wait_for_packet(sk, err, &timeo) == 0);

	return NULL;

no_packet:
	*err = error;
	return NULL;
}
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}
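/*
 * Editor's example (a sketch, not part of this file): the recvmsg()
 * pattern the header comment says is shared by udp.c, af_packet.c and
 * friends. "my_proto_recvmsg" and the offset 0 are hypothetical; real
 * protocols skip their own header (e.g. UDP passes sizeof(struct udphdr)).
 */
#if 0
static int my_proto_recvmsg(struct sock *sk, struct msghdr *msg, int len,
			    int noblock, int flags, int *addr_len)
{
	int err, copied;
	struct sk_buff *skb;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		return err;

	copied = skb->len;
	if (copied > len) {
		copied = len;			/* truncate to the user buffer */
		msg->msg_flags |= MSG_TRUNC;	/* and tell the caller so */
	}
	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	skb_free_datagram(sk, skb);
	return err ? err : copied;
}
#endif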
/*
 *	Copy a datagram to a linear buffer.
 */

int skb_copy_datagram(const struct sk_buff *skb, int offset, char *to, int size)
{
	struct iovec iov = { to, size };

	return skb_copy_datagram_iovec(skb, offset, &iov, size);
}
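/*
 * Editor's note: the wrapper above is just the one-element-iovec case, so
 * copying a whole datagram to a flat user buffer "ubuf" (hypothetical) is
 * simply:
 *
 *	err = skb_copy_datagram(skb, 0, ubuf, skb->len);
 */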
/*
 *	Copy a datagram to an iovec.
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int i, copy;
	int start = skb->len - skb->data_len;

	/* Copy the linear header. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list;

		for (list = skb_shinfo(skb)->frag_list; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				if (copy > len)
					copy = len;
				if (skb_copy_datagram_iovec(list, offset - start,
							    to, copy))
					goto fault;
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
			}
			start = end;
		}
	}
	if (len == 0)
		return 0;

fault:
	return -EFAULT;
}
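/*
 * Editor's note, a worked example of the walk above (numbers illustrative
 * only): for an skb with a 100 byte linear header and one 1400 byte page
 * frag, a call with offset=50, len=200 first copies bytes 50..99 from
 * skb->data (copy = start - offset = 50), leaving offset=100, len=150.
 * The frag loop then sees start=100, end=1500, clips copy to the
 * remaining 150 bytes, and copies them from the page starting at
 * frag->page_offset + (offset - start) = frag->page_offset + 0.
 */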
int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 *to,
			       int len, unsigned int *csump)
{
	int i, copy;
	int start = skb->len - skb->data_len;
	int pos = 0;

	/* Copy and checksum the linear header. */
	if ((copy = start - offset) > 0) {
		int err = 0;

		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			unsigned int csum2;
			int err = 0;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr + frag->page_offset +
						      offset - start, to, copy,
						      0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list;

		for (list = skb_shinfo(skb)->frag_list; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				unsigned int csum2 = 0;

				if (copy > len)
					copy = len;
				if (skb_copy_and_csum_datagram(list, offset - start,
							       to, copy, &csum2))
					goto fault;
				*csump = csum_block_add(*csump, csum2, pos);
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
				to += copy;
				pos += copy;
			}
			start = end;
		}
	}
	if (len == 0)
		return 0;

fault:
	return -EFAULT;
}
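/*
 * Editor's note on the "pos" accumulator above: the Internet checksum is
 * computed 16 bits at a time, so a partial sum taken from an odd byte
 * offset must be byte-swapped before it is folded into the running total.
 * csum_block_add() does exactly that, based on the offset it is given.
 */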
/*
 *	Copy and checksum an skb to a user iovec. The caller _must_ check
 *	that the skb will fit in this iovec.
 *
 *	Returns: 0       - success.
 *	         -EINVAL - checksum failure.
 *	         -EFAULT - fault during copy. Beware, in this case the iovec
 *	                   can be modified!
 */

int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, int hlen,
				     struct iovec *iov)
{
	unsigned int csum;
	int chunk = skb->len - hlen;

	/* Skip filled elements. Pretty silly, look at memcpy_toiovec, though 8) */
	while (iov->iov_len == 0)
		iov++;

	if (iov->iov_len < chunk) {
		/* The data will scatter across elements: verify the checksum
		   over the whole skb first, then do a plain copy. */
		if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen,
							   skb->csum)))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		/* Copy and checksum in a single pass into this element. */
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if ((unsigned short)csum_fold(csum))
			goto csum_error;
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;

csum_error:
	return -EINVAL;

fault:
	return -EFAULT;
}
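/*
 * Editor's example (a simplified sketch): how a UDP-style recvmsg() of
 * this era chooses between the two copy routines. When the checksum has
 * already been verified (or is not needed), a plain copy suffices;
 * otherwise the copy and the verification happen in one pass. MSG_TRUNC
 * and partial-read handling are omitted here.
 */
#if 0
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
					      msg->msg_iov, copied);
	else
		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr),
						       msg->msg_iov);
#endif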
/*
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you _don't_ use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;

	poll_wait(file, sk->sleep, wait);

	/* exceptional events? */
	if (sk->err || !skb_queue_empty(&sk->error_queue))
		mask |= POLLERR;
	if (sk->shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

	return mask;
}
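/*
 * Editor's example (a sketch): address families adopt this poll routine by
 * pointing their proto_ops at it, as the protocols named in the header
 * comment do. "my_dgram_ops" is a hypothetical name and the remaining
 * handlers are elided.
 */
#if 0
static struct proto_ops my_dgram_ops = {
	family:		PF_INET,
	poll:		datagram_poll,		/* the shared generic poll */
	/* bind, connect, sendmsg, recvmsg, ... */
};
#endif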