2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp.c,v 1.215 2001/10/31 08:17:58 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
57 * Alan Cox : Tidied tcp_data to avoid a potential
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFCs. For other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gives up retrying when it gets no
154 * answer at all (not even a no-space one).
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or(at your option) any later version.
213 * Description of States:
215 * TCP_SYN_SENT sent a connection request, waiting for ack
217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake.
220 * TCP_ESTABLISHED connection established
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending
231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore)
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK)
241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending
245 * TCP_CLOSE socket is finished
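 *
 * (Summary added for orientation, per RFC 793: an active close normally
 * walks ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE,
 * while a passive close walks ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE.)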
248 #include <linux/config.h>
249 #include <linux/types.h>
250 #include <linux/fcntl.h>
251 #include <linux/poll.h>
252 #include <linux/init.h>
253 #include <linux/smp_lock.h>
254 #include <linux/fs.h>
255 #include <linux/random.h>
257 #include <net/icmp.h>
260 #include <asm/uaccess.h>
261 #include <asm/ioctls.h>
263 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
265 struct tcp_mib tcp_statistics[NR_CPUS*2];
267 kmem_cache_t *tcp_openreq_cachep;
268 kmem_cache_t *tcp_bucket_cachep;
269 kmem_cache_t *tcp_timewait_cachep;
271 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
273 int sysctl_tcp_default_win_scale = 0;
275 int sysctl_tcp_mem[3];
276 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
277 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
279 atomic_t tcp_memory_allocated; /* Current allocated memory. */
280 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
282 /* Pressure flag: try to collapse.
283 * Technical note: it is used by multiple contexts non atomically.
284 * All the tcp_mem_schedule() is of this nature: accounting
285 * is strict, actions are advisory and have some latency. */
286 int tcp_memory_pressure;
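/* TCP_PAGES() rounds a byte count up to whole memory quanta. As an
 * illustration (assuming TCP_MEM_QUANTUM is a 4096-byte PAGE_SIZE, which
 * is typical but not guaranteed): TCP_PAGES(4096) == 1, TCP_PAGES(4097) == 2.
 */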
288 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
290 int tcp_mem_schedule(struct sock *sk, int size, int kind)
292 int amt = TCP_PAGES(size);
294 sk->forward_alloc += amt*TCP_MEM_QUANTUM;
295 atomic_add(amt, &tcp_memory_allocated);
298 if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
299 if (tcp_memory_pressure)
300 tcp_memory_pressure = 0;
304 /* Over hard limit. */
305 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
306 tcp_enter_memory_pressure();
307 goto suppress_allocation;
310 /* Under pressure. */
311 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
312 tcp_enter_memory_pressure();
315 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
318 if (sk->wmem_queued < sysctl_tcp_wmem[0])
322 if (!tcp_memory_pressure ||
323 sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
324 * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
331 tcp_moderate_sndbuf(sk);
333 /* Fail only if socket is _under_ its sndbuf.
334 * In this case we cannot block, so we have to fail.
336 if (sk->wmem_queued+size >= sk->sndbuf)
340 /* Alas. Undo changes. */
341 sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
342 atomic_sub(amt, &tcp_memory_allocated);
346 void __tcp_mem_reclaim(struct sock *sk)
348 if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
349 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
350 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
351 if (tcp_memory_pressure &&
352 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
353 tcp_memory_pressure = 0;
357 void tcp_rfree(struct sk_buff *skb)
359 struct sock *sk = skb->sk;
361 atomic_sub(skb->truesize, &sk->rmem_alloc);
362 sk->forward_alloc += skb->truesize;
366 * LISTEN is a special case for poll..
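 * A listening socket is reported readable as soon as at least one
 * fully established connection is sitting in its accept queue.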
368 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
370 return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
374 * Wait for a TCP event.
376 * Note that we don't need to lock the socket, as the upper poll layers
377 * take care of normal races (between the test and the event) and we don't
378 * go look at any of the socket buffers directly.
380 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
383 struct sock *sk = sock->sk;
384 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
386 poll_wait(file, sk->sleep, wait);
387 if (sk->state == TCP_LISTEN)
388 return tcp_listen_poll(sk, wait);
390 /* Socket is not locked. We are protected from async events
391 by poll logic and correct handling of state changes
392 * made by other threads is impossible in any case.
400 * POLLHUP is certainly not done right. But poll() doesn't
401 * have a notion of HUP in just one direction, and for a
402 * socket the read side is more interesting.
404 * Some poll() documentation says that POLLHUP is incompatible
405 * with the POLLOUT/POLLWR flags, so somebody should check this
406 * all. But careful, it tends to be safer to return too many
407 * bits than too few, and you can easily break real applications
408 * if you don't tell them that something has hung up!
412 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
413 * our fs/select.c). It means that after we received EOF,
414 * poll always returns immediately, making it impossible to poll() for write()
415 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
416 * if and only if shutdown has been made in both directions.
417 * Actually, it is interesting to look at how Solaris and DUX
418 * solve this dilemma. I would prefer, if POLLHUP were maskable,
419 * then we could set it on SND_SHUTDOWN. BTW examples given
420 * in Stevens' books assume exactly this behaviour, which explains
421 * why POLLHUP is incompatible with POLLOUT. --ANK
423 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
424 * blocking on fresh not-connected or disconnected socket. --ANK
426 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
428 if (sk->shutdown & RCV_SHUTDOWN)
429 mask |= POLLIN | POLLRDNORM;
432 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
433 /* Potential race condition. If read of tp below will
434 * escape above sk->state, we can be illegally awakened
435 * in SYN_* states. */
436 if ((tp->rcv_nxt != tp->copied_seq) &&
437 (tp->urg_seq != tp->copied_seq ||
438 tp->rcv_nxt != tp->copied_seq+1 ||
439 sk->urginline || !tp->urg_data))
440 mask |= POLLIN | POLLRDNORM;
442 if (!(sk->shutdown & SEND_SHUTDOWN)) {
443 if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
444 mask |= POLLOUT | POLLWRNORM;
445 } else { /* send SIGIO later */
446 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
447 set_bit(SOCK_NOSPACE, &sk->socket->flags);
449 /* Race breaker. If space is freed after
450 * wspace test but before the flags are set,
451 * IO signal will be lost.
453 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
454 mask |= POLLOUT | POLLWRNORM;
458 if (tp->urg_data & TCP_URG_VALID)
465 * TCP socket write_space callback.
467 void tcp_write_space(struct sock *sk)
469 struct socket *sock = sk->socket;
471 if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
472 clear_bit(SOCK_NOSPACE, &sock->flags);
474 if (sk->sleep && waitqueue_active(sk->sleep))
475 wake_up_interruptible(sk->sleep);
477 if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
478 sock_wake_async(sock, 2, POLL_OUT);
482 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
484 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
489 if (sk->state == TCP_LISTEN)
493 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
495 else if (sk->urginline || !tp->urg_data ||
496 before(tp->urg_seq,tp->copied_seq) ||
497 !before(tp->urg_seq,tp->rcv_nxt)) {
498 answ = tp->rcv_nxt - tp->copied_seq;
500 /* Subtract 1, if FIN is in queue. */
501 if (answ && !skb_queue_empty(&sk->receive_queue))
502 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
504 answ = tp->urg_seq - tp->copied_seq;
509 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
513 if (sk->state == TCP_LISTEN)
516 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
519 answ = tp->write_seq - tp->snd_una;
522 return(-ENOIOCTLCMD);
525 return put_user(answ, (int *)arg);
529 int tcp_listen_start(struct sock *sk)
531 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
532 struct tcp_listen_opt *lopt;
534 sk->max_ack_backlog = 0;
536 tp->accept_queue = tp->accept_queue_tail = NULL;
537 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
540 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
544 memset(lopt, 0, sizeof(struct tcp_listen_opt));
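/* Size the SYN queue as the smallest power of two that covers
 * sysctl_max_syn_backlog, but never smaller than 2^6 = 64 entries.
 */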
545 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
546 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
548 get_random_bytes(&lopt->hash_rnd, 4);
550 write_lock_bh(&tp->syn_wait_lock);
551 tp->listen_opt = lopt;
552 write_unlock_bh(&tp->syn_wait_lock);
554 /* There is a race window here: we announce ourselves listening,
555 * but this transition is still not validated by get_port().
556 * It is OK, because this socket enters the hash table only
557 * after validation is complete.
559 sk->state = TCP_LISTEN;
560 if (sk->prot->get_port(sk, sk->num) == 0) {
561 sk->sport = htons(sk->num);
569 sk->state = TCP_CLOSE;
570 write_lock_bh(&tp->syn_wait_lock);
571 tp->listen_opt = NULL;
572 write_unlock_bh(&tp->syn_wait_lock);
578 * This routine closes sockets which have been at least partially
579 * opened, but not yet accepted.
582 static void tcp_listen_stop (struct sock *sk)
584 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
585 struct tcp_listen_opt *lopt = tp->listen_opt;
586 struct open_request *acc_req = tp->accept_queue;
587 struct open_request *req;
590 tcp_delete_keepalive_timer(sk);
592 /* make all the listen_opt local to us */
593 write_lock_bh(&tp->syn_wait_lock);
594 tp->listen_opt =NULL;
595 write_unlock_bh(&tp->syn_wait_lock);
596 tp->accept_queue = tp->accept_queue_tail = NULL;
599 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
600 while ((req = lopt->syn_table[i]) != NULL) {
601 lopt->syn_table[i] = req->dl_next;
603 tcp_openreq_free(req);
605 /* Following specs, it would be better either to send FIN
606 * (and enter FIN-WAIT-1, it is normal close)
607 * or to send active reset (abort).
608 * Certainly, it is pretty dangerous during a synflood, but that is a
609 * bad justification for our negligence 8)
610 * To be honest, we are not able to implement either
611 * of the variants now. --ANK
616 BUG_TRAP(lopt->qlen == 0);
620 while ((req=acc_req) != NULL) {
621 struct sock *child = req->sk;
623 acc_req = req->dl_next;
627 BUG_TRAP(child->lock.users==0);
630 tcp_disconnect(child, O_NONBLOCK);
634 atomic_inc(&tcp_orphan_count);
636 tcp_destroy_sock(child);
638 bh_unlock_sock(child);
642 tcp_acceptq_removed(sk);
643 tcp_openreq_fastfree(req);
645 BUG_TRAP(sk->ack_backlog == 0);
649 * Wait for a socket to get into the connected state
651 * Note: Must be called with the socket locked.
653 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
655 struct task_struct *tsk = current;
656 DECLARE_WAITQUEUE(wait, tsk);
658 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
660 return sock_error(sk);
661 if((1 << sk->state) &
662 ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
666 if(signal_pending(tsk))
667 return sock_intr_errno(*timeo_p);
669 __set_task_state(tsk, TASK_INTERRUPTIBLE);
670 add_wait_queue(sk->sleep, &wait);
671 sk->tp_pinfo.af_tcp.write_pending++;
674 *timeo_p = schedule_timeout(*timeo_p);
677 __set_task_state(tsk, TASK_RUNNING);
678 remove_wait_queue(sk->sleep, &wait);
679 sk->tp_pinfo.af_tcp.write_pending--;
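/* True while the data queued for transmit is still below the socket's
 * send buffer limit, i.e. we may keep queueing without blocking.
 */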
684 static inline int tcp_memory_free(struct sock *sk)
686 return sk->wmem_queued < sk->sndbuf;
690 * Wait for more memory for a socket
692 static int wait_for_tcp_memory(struct sock * sk, long *timeo)
696 long current_timeo = *timeo;
697 DECLARE_WAITQUEUE(wait, current);
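/* If the send buffer itself is not full, we got here because an allocation
 * failed rather than because of the sndbuf limit, so only pause for a short
 * randomized interval (a couple of jiffies up to roughly HZ/5) to give the
 * VM a chance to free pages, instead of waiting out the full timeout.
 */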
699 if (tcp_memory_free(sk))
700 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
702 add_wait_queue(sk->sleep, &wait);
704 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
706 set_current_state(TASK_INTERRUPTIBLE);
708 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
712 if (signal_pending(current))
714 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
715 if (tcp_memory_free(sk) && !vm_wait)
718 set_bit(SOCK_NOSPACE, &sk->socket->flags);
719 sk->tp_pinfo.af_tcp.write_pending++;
721 if (!tcp_memory_free(sk) || vm_wait)
722 current_timeo = schedule_timeout(current_timeo);
724 sk->tp_pinfo.af_tcp.write_pending--;
727 vm_wait -= current_timeo;
728 current_timeo = *timeo;
729 if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
730 (current_timeo -= vm_wait) < 0)
734 *timeo = current_timeo;
737 current->state = TASK_RUNNING;
738 remove_wait_queue(sk->sleep, &wait);
748 err = sock_intr_errno(*timeo);
752 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
755 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
758 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
759 return page == frag->page &&
760 off == frag->page_offset+frag->size;
766 fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
768 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
770 frag->page_offset = off;
772 skb_shinfo(skb)->nr_frags = i+1;
775 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
777 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
778 tp->pushed_seq = tp->write_seq;
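/* Force a push once more than half of the peer's largest-ever advertised
 * window has been queued since the last segment we marked with PSH.
 */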
781 static inline int forced_push(struct tcp_opt *tp)
783 return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
787 skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
790 TCP_SKB_CB(skb)->seq = tp->write_seq;
791 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
792 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
793 TCP_SKB_CB(skb)->sacked = 0;
794 __skb_queue_tail(&sk->write_queue, skb);
795 tcp_charge_skb(sk, skb);
796 if (tp->send_head == NULL)
801 tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
803 if (flags & MSG_OOB) {
805 tp->snd_up = tp->write_seq;
806 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
811 tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
814 struct sk_buff *skb = sk->write_queue.prev;
815 if (!(flags&MSG_MORE) || forced_push(tp))
816 tcp_mark_push(tp, skb);
817 tcp_mark_urg(tp, flags, skb);
818 __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
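/* Normalize a send-side failure: prefer any pending socket error over the
 * generic -EPIPE, and deliver SIGPIPE for a broken pipe unless the caller
 * passed MSG_NOSIGNAL.
 */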
822 static int tcp_error(struct sock *sk, int flags, int err)
825 err = sock_error(sk) ? : -EPIPE;
826 if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
827 send_sig(SIGPIPE, current, 0);
831 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
833 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
837 long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
839 /* Wait for a connection to finish. */
840 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
841 if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
844 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
846 mss_now = tcp_current_mss(sk);
850 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
854 struct sk_buff *skb = sk->write_queue.prev;
855 int offset, size, copy, i;
858 page = pages[poffset/PAGE_SIZE];
859 offset = poffset % PAGE_SIZE;
860 size = min_t(size_t, psize, PAGE_SIZE-offset);
862 if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
864 if (!tcp_memory_free(sk))
865 goto wait_for_sndbuf;
867 skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
869 goto wait_for_memory;
871 skb_entail(sk, tp, skb);
878 i = skb_shinfo(skb)->nr_frags;
879 if (can_coalesce(skb, i, page, offset)) {
880 skb_shinfo(skb)->frags[i-1].size += copy;
881 } else if (i < MAX_SKB_FRAGS) {
883 fill_page_desc(skb, i, page, offset, copy);
885 tcp_mark_push(tp, skb);
890 skb->data_len += copy;
891 skb->ip_summed = CHECKSUM_HW;
892 tp->write_seq += copy;
893 TCP_SKB_CB(skb)->end_seq += copy;
896 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
900 if (!(psize -= copy))
903 if (skb->len != mss_now || (flags&MSG_OOB))
906 if (forced_push(tp)) {
907 tcp_mark_push(tp, skb);
908 __tcp_push_pending_frames(sk, tp, mss_now, 1);
909 } else if (skb == tp->send_head)
910 tcp_push_one(sk, mss_now);
914 set_bit(SOCK_NOSPACE, &sk->socket->flags);
917 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
919 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
922 mss_now = tcp_current_mss(sk);
927 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
934 return tcp_error(sk, flags, err);
937 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
940 struct sock *sk = sock->sk;
942 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
944 if (!(sk->route_caps & NETIF_F_SG) ||
945 !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
946 return sock_no_sendpage(sock, page, offset, size, flags);
948 #undef TCP_ZC_CSUM_FLAGS
952 res = do_tcp_sendpages(sk, &page, offset, size, flags);
958 #define TCP_PAGE(sk) (sk->tp_pinfo.af_tcp.sndmsg_page)
959 #define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off)
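/* These remember the last partially filled page used by tcp_sendmsg() and
 * the offset within it, so that successive small writes can keep appending
 * to the same page instead of allocating a fresh one each time.
 */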
962 tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
963 struct page *page, int off, int copy)
968 csum = csum_and_copy_from_user(from, page_address(page)+off,
971 if (skb->ip_summed == CHECKSUM_NONE)
972 skb->csum = csum_block_add(skb->csum, csum, skb->len);
974 skb->data_len += copy;
975 skb->truesize += copy;
976 sk->wmem_queued += copy;
977 sk->forward_alloc -= copy;
983 skb_add_data(struct sk_buff *skb, char *from, int copy)
989 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
992 skb->csum = csum_block_add(skb->csum, csum, off);
996 __skb_trim(skb, off);
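/* Decide how much linear (non-paged) room to reserve in a new skb:
 * one cached MSS by default; on scatter-gather capable routes it is
 * capped so the linear head fits in a single page, with the remainder
 * of the segment carried in page fragments.
 */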
1000 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1002 int tmp = tp->mss_cache;
1004 if (sk->route_caps&NETIF_F_SG) {
1005 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1007 if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1013 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1017 struct sk_buff *skb;
1023 tp = &(sk->tp_pinfo.af_tcp);
1026 TCP_CHECK_TIMER(sk);
1028 flags = msg->msg_flags;
1029 timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1031 /* Wait for a connection to finish. */
1032 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1033 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1036 /* This should be in poll */
1037 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1039 mss_now = tcp_current_mss(sk);
1041 /* Ok commence sending. */
1042 iovlen = msg->msg_iovlen;
1047 if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1050 while (--iovlen >= 0) {
1051 int seglen=iov->iov_len;
1052 unsigned char * from=iov->iov_base;
1056 while (seglen > 0) {
1059 skb = sk->write_queue.prev;
1061 if (tp->send_head == NULL ||
1062 (copy = mss_now - skb->len) <= 0) {
1065 /* Allocate new segment. If the interface is SG,
1066 * allocate skb fitting to single page.
1068 if (!tcp_memory_free(sk))
1069 goto wait_for_sndbuf;
1071 skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1073 goto wait_for_memory;
1075 skb_entail(sk, tp, skb);
1079 /* Try to append data to the end of skb. */
1083 /* Where to copy to? */
1084 if (skb_tailroom(skb) > 0) {
1085 /* We have some space in skb head. Superb! */
1086 if (copy > skb_tailroom(skb))
1087 copy = skb_tailroom(skb);
1088 if ((err = skb_add_data(skb, from, copy)) != 0)
1092 int i = skb_shinfo(skb)->nr_frags;
1093 struct page *page = TCP_PAGE(sk);
1094 int off = TCP_OFF(sk);
1096 if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1097 /* We can extend the last page fragment. */
1099 } else if (i == MAX_SKB_FRAGS ||
1100 (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1101 /* Need to add new fragment and cannot
1102 * do this because interface is non-SG,
1103 * or because all the page slots are busy.
1105 tcp_mark_push(tp, skb);
1108 /* If page is cached, align
1109 * offset to L1 cache boundary
1111 off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1112 if (off == PAGE_SIZE) {
1114 TCP_PAGE(sk) = page = NULL;
1119 /* Allocate new cache page. */
1120 if (!(page=tcp_alloc_page(sk)))
1121 goto wait_for_memory;
1125 if (copy > PAGE_SIZE-off)
1126 copy = PAGE_SIZE-off;
1128 /* Time to copy data. We are close to the end! */
1129 err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1131 /* If this page was new, give it to the
1132 * socket so it does not get leaked.
1134 if (TCP_PAGE(sk) == NULL) {
1135 TCP_PAGE(sk) = page;
1141 /* Update the skb. */
1143 skb_shinfo(skb)->frags[i-1].size += copy;
1145 fill_page_desc(skb, i, page, off, copy);
1148 } else if (off + copy < PAGE_SIZE) {
1150 TCP_PAGE(sk) = page;
1154 TCP_OFF(sk) = off+copy;
1158 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1160 tp->write_seq += copy;
1161 TCP_SKB_CB(skb)->end_seq += copy;
1165 if ((seglen -= copy) == 0 && iovlen == 0)
1168 if (skb->len != mss_now || (flags&MSG_OOB))
1171 if (forced_push(tp)) {
1172 tcp_mark_push(tp, skb);
1173 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1174 } else if (skb == tp->send_head)
1175 tcp_push_one(sk, mss_now);
1179 set_bit(SOCK_NOSPACE, &sk->socket->flags);
1182 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1184 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1187 mss_now = tcp_current_mss(sk);
1193 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1194 TCP_CHECK_TIMER(sk);
1199 if (skb->len == 0) {
1200 if (tp->send_head == skb)
1201 tp->send_head = NULL;
1202 __skb_unlink(skb, skb->list);
1203 tcp_free_skb(sk, skb);
1210 err = tcp_error(sk, flags, err);
1211 TCP_CHECK_TIMER(sk);
1217 * Handle reading urgent data. BSD has very simple semantics for
1218 * this, no blocking and very strange errors 8)
1221 static int tcp_recv_urg(struct sock * sk, long timeo,
1222 struct msghdr *msg, int len, int flags,
1225 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1227 /* No URG data to read. */
1228 if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1229 return -EINVAL; /* Yes this is right ! */
1231 if (sk->state==TCP_CLOSE && !sk->done)
1234 if (tp->urg_data & TCP_URG_VALID) {
1236 char c = tp->urg_data;
1238 if (!(flags & MSG_PEEK))
1239 tp->urg_data = TCP_URG_READ;
1241 /* Read urgent data. */
1242 msg->msg_flags|=MSG_OOB;
1245 if (!(flags & MSG_TRUNC))
1246 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1249 msg->msg_flags|=MSG_TRUNC;
1251 return err ? -EFAULT : len;
1254 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1257 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1258 * the available implementations agree in this case:
1259 * this call should never block, independent of the
1260 * blocking state of the socket.
1261 * Mike <pall@rz.uni-karlsruhe.de>
1267 * Release a skb if it is no longer needed. This routine
1268 * must be called with interrupts disabled or with the
1269 * socket locked so that the sk_buff queue operation is ok.
1272 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1274 __skb_unlink(skb, &sk->receive_queue);
1278 /* Clean up the receive buffer for full frames taken by the user,
1279 * then send an ACK if necessary. COPIED is the number of bytes
1280 * tcp_recvmsg has given to the user so far, it speeds up the
1281 * calculation of whether or not we must ACK for the sake of
1284 static void cleanup_rbuf(struct sock *sk, int copied)
1286 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1287 int time_to_ack = 0;
1290 struct sk_buff *skb = skb_peek(&sk->receive_queue);
1292 BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1295 if (tcp_ack_scheduled(tp)) {
1296 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1298 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1299 || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1301 * If this read emptied read buffer, we send ACK, if
1302 * connection is not bidirectional, user drained
1303 * receive buffer and there was a small segment
1307 (tp->ack.pending&TCP_ACK_PUSHED) &&
1308 !tp->ack.pingpong &&
1309 atomic_read(&sk->rmem_alloc) == 0)) {
1314 /* We send an ACK if we can now advertise a non-zero window
1315 * which has been raised "significantly".
1317 * Even if window raised up to infinity, do not send window open ACK
1318 * in states, where we will not receive more. It is useless.
1320 if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1321 __u32 rcv_window_now = tcp_receive_window(tp);
1323 /* Optimize, __tcp_select_window() is not cheap. */
1324 if (2*rcv_window_now <= tp->window_clamp) {
1325 __u32 new_window = __tcp_select_window(sk);
1327 /* Send ACK now, if this read freed lots of space
1328 * in our buffer. new_window is the window we would advertise now;
1329 * we can advertise it if it is not less than the current one.
1330 * "Lots" means "at least twice" here.
1332 if(new_window && new_window >= 2*rcv_window_now)
1340 /* Now socket state including sk->err is changed only under lock,
1341 * hence we may omit checks after joining the wait queue.
1342 * We check the receive queue before schedule() only as an optimization;
1343 * it is very likely that release_sock() added new data.
1346 static long tcp_data_wait(struct sock *sk, long timeo)
1348 DECLARE_WAITQUEUE(wait, current);
1350 add_wait_queue(sk->sleep, &wait);
1352 __set_current_state(TASK_INTERRUPTIBLE);
1354 set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1357 if (skb_queue_empty(&sk->receive_queue))
1358 timeo = schedule_timeout(timeo);
1361 clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1363 remove_wait_queue(sk->sleep, &wait);
1364 __set_current_state(TASK_RUNNING);
1368 static void tcp_prequeue_process(struct sock *sk)
1370 struct sk_buff *skb;
1371 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1373 net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1375 /* RX process wants to run with disabled BHs, though it is not necessary */
1377 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1378 sk->backlog_rcv(sk, skb);
1381 /* Clear memory counter. */
1382 tp->ucopy.memory = 0;
1386 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1388 struct sk_buff *skb;
1391 skb_queue_walk(&sk->receive_queue, skb) {
1392 offset = seq - TCP_SKB_CB(skb)->seq;
1395 if (offset < skb->len || skb->h.th->fin) {
1404 * This routine provides an alternative to tcp_recvmsg() for routines
1405 * that would like to handle copying from skbuffs directly in 'sendfile'
1408 * - It is assumed that the socket was locked by the caller.
1409 * - The routine does not block.
1410 * - At present, there is no support for reading OOB data
1411 * or for 'peeking' the socket using this routine
1412 * (although both would be easy to implement).
1414 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1415 sk_read_actor_t recv_actor)
1417 struct sk_buff *skb;
1418 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1419 u32 seq = tp->copied_seq;
1423 if (sk->state == TCP_LISTEN)
1425 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1426 if (offset < skb->len) {
1429 len = skb->len - offset;
1430 /* Stop reading if we hit a patch of urgent data */
1432 u32 urg_offset = tp->urg_seq - seq;
1433 if (urg_offset < len)
1438 used = recv_actor(desc, skb, offset, len);
1444 if (offset != skb->len)
1447 if (skb->h.th->fin) {
1448 tcp_eat_skb(sk, skb);
1452 tcp_eat_skb(sk, skb);
1456 tp->copied_seq = seq;
1458 tcp_rcv_space_adjust(sk);
1460 /* Clean up data we have read: This will do ACK frames. */
1462 cleanup_rbuf(sk, copied);
1467 * This routine copies from a sock struct into the user buffer.
1469 * Technical note: in 2.3 we work on _locked_ socket, so that
1470 * tricks with *seq access order and skb->users are not required.
1471 * Probably, code can be easily improved even more.
1474 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1475 int len, int nonblock, int flags, int *addr_len)
1477 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1483 int target; /* Read at least this many bytes */
1485 struct task_struct *user_recv = NULL;
1489 TCP_CHECK_TIMER(sk);
1492 if (sk->state == TCP_LISTEN)
1495 timeo = sock_rcvtimeo(sk, nonblock);
1497 /* Urgent data needs to be handled specially. */
1498 if (flags & MSG_OOB)
1501 seq = &tp->copied_seq;
1502 if (flags & MSG_PEEK) {
1503 peek_seq = tp->copied_seq;
1507 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1510 struct sk_buff * skb;
1513 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1514 if (tp->urg_data && tp->urg_seq == *seq) {
1517 if (signal_pending(current)) {
1518 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1523 /* Next get a buffer. */
1525 skb = skb_peek(&sk->receive_queue);
1530 /* Now that we have two receive queues this
1533 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1534 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1535 *seq, TCP_SKB_CB(skb)->seq);
1538 offset = *seq - TCP_SKB_CB(skb)->seq;
1541 if (offset < skb->len)
1545 BUG_TRAP(flags&MSG_PEEK);
1547 } while (skb != (struct sk_buff *)&sk->receive_queue);
1549 /* Well, if we have backlog, try to process it now.
1551 if (copied >= target && sk->backlog.tail == NULL)
1556 sk->state == TCP_CLOSE ||
1557 (sk->shutdown & RCV_SHUTDOWN) ||
1559 signal_pending(current) ||
1567 copied = sock_error(sk);
1571 if (sk->shutdown & RCV_SHUTDOWN)
1574 if (sk->state == TCP_CLOSE) {
1576 /* This occurs when user tries to read
1577 * from never connected socket.
1590 if (signal_pending(current)) {
1591 copied = sock_intr_errno(timeo);
1596 cleanup_rbuf(sk, copied);
1598 if (tp->ucopy.task == user_recv) {
1599 /* Install new reader */
1600 if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1601 user_recv = current;
1602 tp->ucopy.task = user_recv;
1603 tp->ucopy.iov = msg->msg_iov;
1606 tp->ucopy.len = len;
1608 BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1610 * Ugly... If prequeue is not empty, we have to
1611 * process it before releasing the socket, otherwise
1612 * ordering will be broken on the second iteration.
1613 * A more elegant solution is required!!!
1615 * Look: we have the following (pseudo)queues:
1617 * 1. packets in flight
1622 * Each queue can be processed only if the next ones
1623 * are empty. At this point we have empty receive_queue.
1624 * But prequeue _can_ be non-empty after the second iteration,
1625 * when we jumped to the start of the loop because backlog
1626 * processing added something to receive_queue.
1627 * We cannot release_sock(), because backlog contains
1628 * packets that arrived _after_ the prequeued ones.
1630 * In short, the algorithm is clear --- process all
1631 * the queues in order. We could do it more directly,
1632 * requeueing packets from backlog to prequeue if it
1633 * is not empty. It is more elegant, but eats cycles,
1636 if (skb_queue_len(&tp->ucopy.prequeue))
1639 /* __ Set realtime policy in scheduler __ */
1642 if (copied >= target) {
1643 /* Do not sleep, just process backlog. */
1647 timeo = tcp_data_wait(sk, timeo);
1653 /* __ Restore normal policy in scheduler __ */
1655 if ((chunk = len - tp->ucopy.len) != 0) {
1656 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1661 if (tp->rcv_nxt == tp->copied_seq &&
1662 skb_queue_len(&tp->ucopy.prequeue)) {
1664 tcp_prequeue_process(sk);
1666 if ((chunk = len - tp->ucopy.len) != 0) {
1667 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1673 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1674 if (net_ratelimit())
1675 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1676 current->comm, current->pid);
1677 peek_seq = tp->copied_seq;
1682 /* Ok so how much can we use? */
1683 used = skb->len - offset;
1687 /* Do we have urgent data here? */
1689 u32 urg_offset = tp->urg_seq - *seq;
1690 if (urg_offset < used) {
1692 if (!sk->urginline) {
1704 if (!(flags&MSG_TRUNC)) {
1705 err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1707 /* Exception. Bailout! */
1718 tcp_rcv_space_adjust(sk);
1721 if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1723 tcp_fast_path_check(sk, tp);
1725 if (used + offset < skb->len)
1730 if (!(flags & MSG_PEEK))
1731 tcp_eat_skb(sk, skb);
1735 /* Process the FIN. */
1737 if (!(flags & MSG_PEEK))
1738 tcp_eat_skb(sk, skb);
1743 if (skb_queue_len(&tp->ucopy.prequeue)) {
1746 tp->ucopy.len = copied > 0 ? len : 0;
1748 tcp_prequeue_process(sk);
1750 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1751 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1757 tp->ucopy.task = NULL;
1761 /* According to UNIX98, msg_name/msg_namelen are ignored
1762 * on a connected socket. I was just happy when I found this 8) --ANK
1765 /* Clean up data we have read: This will do ACK frames. */
1766 cleanup_rbuf(sk, copied);
1768 TCP_CHECK_TIMER(sk);
1773 TCP_CHECK_TIMER(sk);
1778 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1783 * State processing on a close. This implements the state shift for
1784 * sending our FIN frame. Note that we only send a FIN for some
1785 * states. A shutdown() may have already sent the FIN, or we may be
1789 static unsigned char new_state[16] = {
1790 /* current state: new state: action: */
1791 /* (Invalid) */ TCP_CLOSE,
1792 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1793 /* TCP_SYN_SENT */ TCP_CLOSE,
1794 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1795 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1796 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1797 /* TCP_TIME_WAIT */ TCP_CLOSE,
1798 /* TCP_CLOSE */ TCP_CLOSE,
1799 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1800 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1801 /* TCP_LISTEN */ TCP_CLOSE,
1802 /* TCP_CLOSING */ TCP_CLOSING,
1805 static int tcp_close_state(struct sock *sk)
1807 int next = (int) new_state[sk->state];
1808 int ns = (next & TCP_STATE_MASK);
1810 tcp_set_state(sk, ns);
1812 return (next & TCP_ACTION_FIN);
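/* A nonzero return tells the caller that a FIN must now be queued; the
 * usage below (tcp_shutdown/tcp_close) is roughly, as a sketch:
 * if (tcp_close_state(sk)) tcp_send_fin(sk);
 */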
1816 * Shutdown the sending side of a connection. Much like close except
1817 * that we don't receive shut down or set sk->dead.
1820 void tcp_shutdown(struct sock *sk, int how)
1822 /* We need to grab some memory, and put together a FIN,
1823 * and then put it into the queue to be sent.
1824 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1826 if (!(how & SEND_SHUTDOWN))
1829 /* If we've already sent a FIN, or it's a closed state, skip this. */
1830 if ((1 << sk->state) &
1831 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1832 /* Clear out any half completed packets. FIN if needed. */
1833 if (tcp_close_state(sk))
1840 * Return 1 if we still have things to send in our buffers.
1843 static inline int closing(struct sock * sk)
1845 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1848 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1850 /* First the read buffer. */
1851 __skb_queue_purge(&sk->receive_queue);
1853 /* Next, the error queue. */
1854 __skb_queue_purge(&sk->error_queue);
1856 /* Next, the write queue. */
1857 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1859 /* Account for returned memory. */
1860 tcp_mem_reclaim(sk);
1862 BUG_TRAP(sk->wmem_queued == 0);
1863 BUG_TRAP(sk->forward_alloc == 0);
1865 /* It is _impossible_ for the backlog to contain anything
1866 * when we get here. All user references to this socket
1867 * have gone away; only the net layer can touch it.
1872 * At this point, there should be no process reference to this
1873 * socket, and thus no user references at all. Therefore we
1874 * can assume the socket waitqueue is inactive and nobody will
1875 * try to jump onto it.
1877 void tcp_destroy_sock(struct sock *sk)
1879 BUG_TRAP(sk->state==TCP_CLOSE);
1882 /* It cannot be in hash table! */
1883 BUG_TRAP(sk->pprev==NULL);
1885 /* If it has not 0 sk->num, it must be bound */
1886 BUG_TRAP(!sk->num || sk->prev!=NULL);
1890 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1896 sk->prot->destroy(sk);
1898 tcp_kill_sk_queues(sk);
1900 #ifdef INET_REFCNT_DEBUG
1901 if (atomic_read(&sk->refcnt) != 1) {
1902 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1906 atomic_dec(&tcp_orphan_count);
1910 void tcp_close(struct sock *sk, long timeout)
1912 struct sk_buff *skb;
1913 int data_was_unread = 0;
1916 sk->shutdown = SHUTDOWN_MASK;
1918 if(sk->state == TCP_LISTEN) {
1919 tcp_set_state(sk, TCP_CLOSE);
1922 tcp_listen_stop(sk);
1924 goto adjudge_to_death;
1927 /* We need to flush the recv. buffs. We do this only on the
1928 * descriptor close, not protocol-sourced closes, because the
1929 * reader process may not have drained the data yet!
1931 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1932 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1933 data_was_unread += len;
1937 tcp_mem_reclaim(sk);
1939 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1940 * 3.10, we send a RST here because data was lost. To
1941 * witness the awful effects of the old behavior of always
1942 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1943 * a bulk GET in an FTP client, suspend the process, wait
1944 * for the client to advertise a zero window, then kill -9
1945 * the FTP client, wheee... Note: timeout is always zero
1948 if(data_was_unread != 0) {
1949 /* Unread data was tossed, zap the connection. */
1950 NET_INC_STATS_USER(TCPAbortOnClose);
1951 tcp_set_state(sk, TCP_CLOSE);
1952 tcp_send_active_reset(sk, GFP_KERNEL);
1953 } else if (sk->linger && sk->lingertime==0) {
1954 /* Check zero linger _after_ checking for unread data. */
1955 sk->prot->disconnect(sk, 0);
1956 NET_INC_STATS_USER(TCPAbortOnData);
1957 } else if (tcp_close_state(sk)) {
1958 /* We FIN if the application ate all the data before
1959 * zapping the connection.
1962 /* RED-PEN. Formally speaking, we have broken TCP state
1963 * machine. State transitions:
1965 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1966 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1967 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1969 * are legal only when FIN has been sent (i.e. in window),
1970 * rather than queued out of window. Purists blame.
1972 * F.e. "RFC state" is ESTABLISHED,
1973 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1975 * The visible deviations are that sometimes
1976 * we enter time-wait state, when it is not required really
1977 * (harmless), do not send active resets, when they are
1978 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1979 * they look as CLOSING or LAST_ACK for Linux)
1980 * Probably, I missed some more holelets.
1987 struct task_struct *tsk = current;
1988 DECLARE_WAITQUEUE(wait, current);
1990 add_wait_queue(sk->sleep, &wait);
1993 set_current_state(TASK_INTERRUPTIBLE);
1997 timeout = schedule_timeout(timeout);
1999 } while (!signal_pending(tsk) && timeout);
2001 tsk->state = TASK_RUNNING;
2002 remove_wait_queue(sk->sleep, &wait);
2006 /* It is the last release_sock in its life. It will remove backlog. */
2010 /* Now socket is owned by kernel and we acquire BH lock
2011 to finish close. No need to check for user refs.
2015 BUG_TRAP(sk->lock.users==0);
2020 /* This is a (useful) BSD violation of the RFC. There is a
2021 * problem with TCP as specified in that the other end could
2022 * keep a socket open forever with no application left this end.
2023 * We use a 3 minute timeout (about the same as BSD) then kill
2024 * our end. If they send after that then tough - BUT: long enough
2025 * that we won't make the old 4*rto = almost no time - whoops
2028 * Nope, it was not a mistake. It is really the desired behaviour
2029 * f.e. on http servers, when such sockets are useless, but
2030 * consume significant resources. Let's do it with special
2031 * linger2 option. --ANK
2034 if (sk->state == TCP_FIN_WAIT2) {
2035 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2036 if (tp->linger2 < 0) {
2037 tcp_set_state(sk, TCP_CLOSE);
2038 tcp_send_active_reset(sk, GFP_ATOMIC);
2039 NET_INC_STATS_BH(TCPAbortOnLinger);
2041 int tmo = tcp_fin_time(tp);
2043 if (tmo > TCP_TIMEWAIT_LEN) {
2044 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2046 atomic_inc(&tcp_orphan_count);
2047 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2052 if (sk->state != TCP_CLOSE) {
2053 tcp_mem_reclaim(sk);
2054 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2055 (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2056 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2057 if (net_ratelimit())
2058 printk(KERN_INFO "TCP: too many orphaned sockets\n");
2059 tcp_set_state(sk, TCP_CLOSE);
2060 tcp_send_active_reset(sk, GFP_ATOMIC);
2061 NET_INC_STATS_BH(TCPAbortOnMemory);
2064 atomic_inc(&tcp_orphan_count);
2066 if (sk->state == TCP_CLOSE)
2067 tcp_destroy_sock(sk);
2068 /* Otherwise, socket is reprieved until protocol close. */
2076 /* These states need RST on ABORT according to RFC793 */
2078 static inline int tcp_need_reset(int state)
2080 return ((1 << state) &
2081 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2082 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2085 int tcp_disconnect(struct sock *sk, int flags)
2087 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2091 old_state = sk->state;
2092 if (old_state != TCP_CLOSE)
2093 tcp_set_state(sk, TCP_CLOSE);
2095 /* ABORT function of RFC793 */
2096 if (old_state == TCP_LISTEN) {
2097 tcp_listen_stop(sk);
2098 } else if (tcp_need_reset(old_state) ||
2099 (tp->snd_nxt != tp->write_seq &&
2100 (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2101 /* The last check adjusts for the discrepancy of Linux wrt. RFC
2104 tcp_send_active_reset(sk, gfp_any());
2105 sk->err = ECONNRESET;
2106 } else if (old_state == TCP_SYN_SENT)
2107 sk->err = ECONNRESET;
2109 tcp_clear_xmit_timers(sk);
2110 __skb_queue_purge(&sk->receive_queue);
2111 tcp_writequeue_purge(sk);
2112 __skb_queue_purge(&tp->out_of_order_queue);
2116 if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2119 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2120 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2121 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2128 if ((tp->write_seq += tp->max_window+2) == 0)
2133 tp->packets_out = 0;
2134 tp->snd_ssthresh = 0x7fffffff;
2135 tp->snd_cwnd_cnt = 0;
2136 tcp_set_ca_state(tp, TCP_CA_Open);
2137 tcp_clear_retrans(tp);
2138 tcp_delack_init(tp);
2139 tp->send_head = NULL;
2144 BUG_TRAP(!sk->num || sk->prev);
2146 sk->error_report(sk);
2151 * Wait for an incoming connection, avoid race
2152 * conditions. This must be called with the socket locked.
2154 static int wait_for_connect(struct sock * sk, long timeo)
2156 DECLARE_WAITQUEUE(wait, current);
2160 * True wake-one mechanism for incoming connections: only
2161 * one process gets woken up, not the 'whole herd'.
2162 * Since we do not 'race & poll' for established sockets
2163 * anymore, the common case will execute the loop only once.
2165 * Subtle issue: "add_wait_queue_exclusive()" will be added
2166 * after any current non-exclusive waiters, and we know that
2167 * it will always _stay_ after any new non-exclusive waiters
2168 * because all non-exclusive waiters are added at the
2169 * beginning of the wait-queue. As such, it's ok to "drop"
2170 * our exclusiveness temporarily when we get woken up without
2171 * having to remove and re-insert us on the wait queue.
2173 add_wait_queue_exclusive(sk->sleep, &wait);
2175 current->state = TASK_INTERRUPTIBLE;
2177 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2178 timeo = schedule_timeout(timeo);
2181 if (sk->tp_pinfo.af_tcp.accept_queue)
2184 if (sk->state != TCP_LISTEN)
2186 err = sock_intr_errno(timeo);
2187 if (signal_pending(current))
2193 current->state = TASK_RUNNING;
2194 remove_wait_queue(sk->sleep, &wait);
2199 * This will accept the next outstanding connection.
2202 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2204 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2205 struct open_request *req;
2211 /* We need to make sure that this socket is listening,
2212 * and that it has something pending.
2215 if (sk->state != TCP_LISTEN)
2218 /* Find already established connection */
2219 if (!tp->accept_queue) {
2220 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2222 /* If this is a non blocking socket don't sleep */
2227 error = wait_for_connect(sk, timeo);
2232 req = tp->accept_queue;
2233 if ((tp->accept_queue = req->dl_next) == NULL)
2234 tp->accept_queue_tail = NULL;
2237 tcp_acceptq_removed(sk);
2238 tcp_openreq_fastfree(req);
2239 BUG_TRAP(newsk->state != TCP_SYN_RECV);
2250 * Socket option code for TCP.
2253 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2256 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2260 if (level != SOL_TCP)
2261 return tp->af_specific->setsockopt(sk, level, optname,
2264 if(optlen<sizeof(int))
2267 if (get_user(val, (int *)optval))
2274 /* Values greater than the interface MTU won't take effect. However, at
2275 * the point when this call is done we typically don't yet know
2276 * which interface is going to be used.
2278 if(val < 8 || val > MAX_TCP_WINDOW) {
2286 /* You cannot try to use this and TCP_CORK in
2287 * tandem, so let the user know.
2289 if (tp->nonagle == 2) {
2293 tp->nonagle = (val == 0) ? 0 : 1;
2295 tcp_push_pending_frames(sk, tp);
2299 /* When set indicates to always queue non-full frames.
2300 * Later the user clears this option and we transmit
2301 * any pending partial frames in the queue. This is
2302 * meant to be used alongside sendfile() to get properly
2303 * filled frames when the user (for example) must write
2304 * out headers with a write() call first and then use
2305 * sendfile to send out the data parts.
2307 * You cannot try to use TCP_NODELAY and this mechanism
2308 * at the same time, so let the user know.
2310 if (tp->nonagle == 1) {
2319 tcp_push_pending_frames(sk, tp);
2324 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2327 tp->keepalive_time = val * HZ;
2328 if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2329 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2330 if (tp->keepalive_time > elapsed)
2331 elapsed = tp->keepalive_time - elapsed;
2334 tcp_reset_keepalive_timer(sk, elapsed);
2339 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2342 tp->keepalive_intvl = val * HZ;
2345 if (val < 1 || val > MAX_TCP_KEEPCNT)
2348 tp->keepalive_probes = val;
2351 if (val < 1 || val > MAX_TCP_SYNCNT)
2354 tp->syn_retries = val;
2360 else if (val > sysctl_tcp_fin_timeout/HZ)
2363 tp->linger2 = val*HZ;
2366 case TCP_DEFER_ACCEPT:
2367 tp->defer_accept = 0;
2369 /* Translate value in seconds to number of retransmits */
2370 while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2376 case TCP_WINDOW_CLAMP:
2378 if (sk->state != TCP_CLOSE) {
2382 tp->window_clamp = 0;
2384 tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2385 SOCK_MIN_RCVBUF/2 : val;
2391 tp->ack.pingpong = 1;
2393 tp->ack.pingpong = 0;
2394 if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2395 tcp_ack_scheduled(tp)) {
2396 tp->ack.pending |= TCP_ACK_PUSHED;
2397 cleanup_rbuf(sk, 1);
2399 tp->ack.pingpong = 1;
2412 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2415 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2418 if(level != SOL_TCP)
2419 return tp->af_specific->getsockopt(sk, level, optname,
2422 if(get_user(len,optlen))
2425 len = min_t(unsigned int, len, sizeof(int));
2432 val = tp->mss_cache;
2433 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2437 val = (tp->nonagle == 1);
2440 val = (tp->nonagle == 2);
2443 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2446 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2449 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2452 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2457 val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2459 case TCP_DEFER_ACCEPT:
2460 val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2462 case TCP_WINDOW_CLAMP:
2463 val = tp->window_clamp;
2467 struct tcp_info info;
2468 u32 now = tcp_time_stamp;
2470 if(get_user(len,optlen))
2472 info.tcpi_state = sk->state;
2473 info.tcpi_ca_state = tp->ca_state;
2474 info.tcpi_retransmits = tp->retransmits;
2475 info.tcpi_probes = tp->probes_out;
2476 info.tcpi_backoff = tp->backoff;
2477 info.tcpi_options = 0;
2479 info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2481 info.tcpi_options |= TCPI_OPT_SACK;
2482 if (tp->wscale_ok) {
2483 info.tcpi_options |= TCPI_OPT_WSCALE;
2484 info.tcpi_snd_wscale = tp->snd_wscale;
2485 info.tcpi_rcv_wscale = tp->rcv_wscale;
2487 info.tcpi_snd_wscale = 0;
2488 info.tcpi_rcv_wscale = 0;
2490 if (tp->ecn_flags&TCP_ECN_OK)
2491 info.tcpi_options |= TCPI_OPT_ECN;
2493 info.tcpi_rto = (1000000*tp->rto)/HZ;
2494 info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2495 info.tcpi_snd_mss = tp->mss_cache;
2496 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2498 info.tcpi_unacked = tp->packets_out;
2499 info.tcpi_sacked = tp->sacked_out;
2500 info.tcpi_lost = tp->lost_out;
2501 info.tcpi_retrans = tp->retrans_out;
2502 info.tcpi_fackets = tp->fackets_out;
2504 info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2505 info.tcpi_last_ack_sent = 0;
2506 info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2507 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2509 info.tcpi_pmtu = tp->pmtu_cookie;
2510 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2511 info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2512 info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2513 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2514 info.tcpi_snd_cwnd = tp->snd_cwnd;
2515 info.tcpi_advmss = tp->advmss;
2516 info.tcpi_reordering = tp->reordering;
2518 len = min_t(unsigned int, len, sizeof(info));
2519 if(put_user(len, optlen))
2521 if(copy_to_user(optval, &info,len))
2526 val = !tp->ack.pingpong;
2529 return -ENOPROTOOPT;
2532 if(put_user(len, optlen))
2534 if(copy_to_user(optval, &val,len))
2540 extern void __skb_cb_too_small_for_tcp(int, int);
2541 extern void tcpdiag_init(void);
2543 void __init tcp_init(void)
2545 struct sk_buff *skb = NULL;
2549 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2550 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2553 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2554 sizeof(struct open_request),
2555 0, SLAB_HWCACHE_ALIGN,
2557 if(!tcp_openreq_cachep)
2558 panic("tcp_init: Cannot alloc open_request cache.");
2560 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2561 sizeof(struct tcp_bind_bucket),
2562 0, SLAB_HWCACHE_ALIGN,
2564 if(!tcp_bucket_cachep)
2565 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2567 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2568 sizeof(struct tcp_tw_bucket),
2569 0, SLAB_HWCACHE_ALIGN,
2571 if(!tcp_timewait_cachep)
2572 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2574 /* Size and allocate the main established and bind bucket
2577 * The methodology is similar to that of the buffer cache.
2579 if (num_physpages >= (128 * 1024))
2580 goal = num_physpages >> (21 - PAGE_SHIFT);
2582 goal = num_physpages >> (23 - PAGE_SHIFT);
2584 for(order = 0; (1UL << order) < goal; order++)
2587 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2588 sizeof(struct tcp_ehash_bucket);
2589 tcp_ehash_size >>= 1;
2590 while (tcp_ehash_size & (tcp_ehash_size-1))
2592 tcp_ehash = (struct tcp_ehash_bucket *)
2593 __get_free_pages(GFP_ATOMIC, order);
2594 } while (tcp_ehash == NULL && --order > 0);
2597 panic("Failed to allocate TCP established hash table\n");
2598 for (i = 0; i < (tcp_ehash_size<<1); i++) {
2599 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2600 tcp_ehash[i].chain = NULL;
2604 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2605 sizeof(struct tcp_bind_hashbucket);
2606 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2608 tcp_bhash = (struct tcp_bind_hashbucket *)
2609 __get_free_pages(GFP_ATOMIC, order);
2610 } while (tcp_bhash == NULL && --order >= 0);
2613 panic("Failed to allocate TCP bind hash table\n");
2614 for (i = 0; i < tcp_bhash_size; i++) {
2615 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2616 tcp_bhash[i].chain = NULL;
2619 /* Try to be a bit smarter and adjust defaults depending
2620 * on available memory.
2623 sysctl_local_port_range[0] = 32768;
2624 sysctl_local_port_range[1] = 61000;
2625 sysctl_tcp_max_tw_buckets = 180000;
2626 sysctl_tcp_max_orphans = 4096<<(order-4);
2627 sysctl_max_syn_backlog = 1024;
2628 } else if (order < 3) {
2629 sysctl_local_port_range[0] = 1024*(3-order);
2630 sysctl_tcp_max_tw_buckets >>= (3-order);
2631 sysctl_tcp_max_orphans >>= (3-order);
2632 sysctl_max_syn_backlog = 128;
2634 tcp_port_rover = sysctl_local_port_range[0] - 1;
2636 sysctl_tcp_mem[0] = 768<<order;
2637 sysctl_tcp_mem[1] = 1024<<order;
2638 sysctl_tcp_mem[2] = 1536<<order;
2641 sysctl_tcp_wmem[2] = 64*1024;
2642 sysctl_tcp_rmem[0] = PAGE_SIZE;
2643 sysctl_tcp_rmem[1] = 43689;
2644 sysctl_tcp_rmem[2] = 2*43689;
2647 printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2648 tcp_ehash_size<<1, tcp_bhash_size);
2650 (void) tcp_mib_init();