cleanup
[linux-2.4.21-pre4.git] / net / ipv4 / tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.1.1.1 2005/04/11 02:51:13 jack Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed where wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties remove from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but its a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *                                      
208  *              This program is free software; you can redistribute it and/or
209  *              modify it under the terms of the GNU General Public License
210  *              as published by the Free Software Foundation; either version
211  *              2 of the License, or(at your option) any later version.
212  *
213  * Description of States:
214  *
215  *      TCP_SYN_SENT            sent a connection request, waiting for ack
216  *
217  *      TCP_SYN_RECV            received a connection request, sent ack,
218  *                              waiting for final ack in three-way handshake.
219  *
220  *      TCP_ESTABLISHED         connection established
221  *
222  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
223  *                              transmission of remaining buffered data
224  *
225  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
226  *                              to shutdown
227  *
228  *      TCP_CLOSING             both sides have shutdown but we still have
229  *                              data we have to finish sending
230  *
231  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
232  *                              closed, can only be entered from FIN_WAIT2
233  *                              or CLOSING.  Required because the other end
234  *                              may not have gotten our last ACK causing it
235  *                              to retransmit the data packet (which we ignore)
236  *
237  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
238  *                              us to finish writing our data and to shutdown
239  *                              (we have to close() to move on to LAST_ACK)
240  *
241  *      TCP_LAST_ACK            out side has shutdown after remote has
242  *                              shutdown.  There may still be data in our
243  *                              buffer that we have to finish sending
244  *
245  *      TCP_CLOSE               socket is finished
246  */
247
248 #include <linux/config.h>
249 #include <linux/types.h>
250 #include <linux/fcntl.h>
251 #include <linux/poll.h>
252 #include <linux/init.h>
253 #include <linux/smp_lock.h>
254 #include <linux/fs.h>
255
256 #include <net/icmp.h>
257 #include <net/tcp.h>
258
259 #include <asm/uaccess.h>
260 #include <asm/ioctls.h>
261
262 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
263
264 struct tcp_mib  tcp_statistics[NR_CPUS*2];
265
266 kmem_cache_t *tcp_openreq_cachep;
267 kmem_cache_t *tcp_bucket_cachep;
268 kmem_cache_t *tcp_timewait_cachep;
269
270 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
271
272 int sysctl_tcp_mem[3];
273 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
274 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
275
276 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
277 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
278
279 /* Pressure flag: try to collapse.
280  * Technical note: it is used by multiple contexts non atomically.
281  * All the tcp_mem_schedule() is of this nature: accounting
282  * is strict, actions are advisory and have some latency. */
283 int tcp_memory_pressure;
284
285 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
286
287 int tcp_mem_schedule(struct sock *sk, int size, int kind)
288 {
289         int amt = TCP_PAGES(size);
290
291         sk->forward_alloc += amt*TCP_MEM_QUANTUM;
292         atomic_add(amt, &tcp_memory_allocated);
293
294         /* Under limit. */
295         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
296                 if (tcp_memory_pressure)
297                         tcp_memory_pressure = 0;
298                 return 1;
299         }
300
301         /* Over hard limit. */
302         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
303                 tcp_enter_memory_pressure();
304                 goto suppress_allocation;
305         }
306
307         /* Under pressure. */
308         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
309                 tcp_enter_memory_pressure();
310
311         if (kind) {
312                 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
313                         return 1;
314         } else {
315                 if (sk->wmem_queued < sysctl_tcp_wmem[0])
316                         return 1;
317         }
318
319         if (!tcp_memory_pressure ||
320             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
321             * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
322                         sk->forward_alloc))
323                 return 1;
324
325 suppress_allocation:
326
327         if (kind == 0) {
328                 tcp_moderate_sndbuf(sk);
329
330                 /* Fail only if socket is _under_ its sndbuf.
331                  * In this case we cannot block, so that we have to fail.
332                  */
333                 if (sk->wmem_queued+size >= sk->sndbuf)
334                         return 1;
335         }
336
337         /* Alas. Undo changes. */
338         sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
339         atomic_sub(amt, &tcp_memory_allocated);
340         return 0;
341 }
342
343 void __tcp_mem_reclaim(struct sock *sk)
344 {
345         if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
346                 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
347                 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
348                 if (tcp_memory_pressure &&
349                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
350                         tcp_memory_pressure = 0;
351         }
352 }
353
354 void tcp_rfree(struct sk_buff *skb)
355 {
356         struct sock *sk = skb->sk;
357
358         atomic_sub(skb->truesize, &sk->rmem_alloc);
359         sk->forward_alloc += skb->truesize;
360 }
361
362 /*
363  * LISTEN is a special case for poll..
364  */
365 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
366 {
367         return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
368 }
369
370 /*
371  *      Wait for a TCP event.
372  *
373  *      Note that we don't need to lock the socket, as the upper poll layers
374  *      take care of normal races (between the test and the event) and we don't
375  *      go look at any of the socket buffers directly.
376  */
377 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
378 {
379         unsigned int mask;
380         struct sock *sk = sock->sk;
381         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
382
383         poll_wait(file, sk->sleep, wait);
384         if (sk->state == TCP_LISTEN)
385                 return tcp_listen_poll(sk, wait);
386
387         /* Socket is not locked. We are protected from async events
388            by poll logic and correct handling of state changes
389            made by another threads is impossible in any case.
390          */
391
392         mask = 0;
393         if (sk->err)
394                 mask = POLLERR;
395
396         /*
397          * POLLHUP is certainly not done right. But poll() doesn't
398          * have a notion of HUP in just one direction, and for a
399          * socket the read side is more interesting.
400          *
401          * Some poll() documentation says that POLLHUP is incompatible
402          * with the POLLOUT/POLLWR flags, so somebody should check this
403          * all. But careful, it tends to be safer to return too many
404          * bits than too few, and you can easily break real applications
405          * if you don't tell them that something has hung up!
406          *
407          * Check-me.
408          *
409          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
410          * our fs/select.c). It means that after we received EOF,
411          * poll always returns immediately, making impossible poll() on write()
412          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
413          * if and only if shutdown has been made in both directions.
414          * Actually, it is interesting to look how Solaris and DUX
415          * solve this dilemma. I would prefer, if PULLHUP were maskable,
416          * then we could set it on SND_SHUTDOWN. BTW examples given
417          * in Stevens' books assume exactly this behaviour, it explains
418          * why PULLHUP is incompatible with POLLOUT.    --ANK
419          *
420          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
421          * blocking on fresh not-connected or disconnected socket. --ANK
422          */
423         if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
424                 mask |= POLLHUP;
425         if (sk->shutdown & RCV_SHUTDOWN)
426                 mask |= POLLIN | POLLRDNORM;
427
428         /* Connected? */
429         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
430                 /* Potential race condition. If read of tp below will
431                  * escape above sk->state, we can be illegally awaken
432                  * in SYN_* states. */
433                 if ((tp->rcv_nxt != tp->copied_seq) &&
434                     (tp->urg_seq != tp->copied_seq ||
435                      tp->rcv_nxt != tp->copied_seq+1 ||
436                      sk->urginline || !tp->urg_data))
437                         mask |= POLLIN | POLLRDNORM;
438
439                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
440                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
441                                 mask |= POLLOUT | POLLWRNORM;
442                         } else {  /* send SIGIO later */
443                                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
444                                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
445
446                                 /* Race breaker. If space is freed after
447                                  * wspace test but before the flags are set,
448                                  * IO signal will be lost.
449                                  */
450                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
451                                         mask |= POLLOUT | POLLWRNORM;
452                         }
453                 }
454
455                 if (tp->urg_data & TCP_URG_VALID)
456                         mask |= POLLPRI;
457         }
458         return mask;
459 }
460
461 /*
462  *      TCP socket write_space callback.
463  */
464 void tcp_write_space(struct sock *sk)
465 {
466         struct socket *sock = sk->socket;
467
468         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
469                 clear_bit(SOCK_NOSPACE, &sock->flags);
470
471                 if (sk->sleep && waitqueue_active(sk->sleep))
472                         wake_up_interruptible(sk->sleep);
473
474                 if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
475                         sock_wake_async(sock, 2, POLL_OUT);
476         }
477 }
478
479 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
480 {
481         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
482         int answ;
483
484         switch(cmd) {
485         case SIOCINQ:
486                 if (sk->state == TCP_LISTEN)
487                         return(-EINVAL);
488
489                 lock_sock(sk);
490                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
491                         answ = 0;
492                 else if (sk->urginline || !tp->urg_data ||
493                          before(tp->urg_seq,tp->copied_seq) ||
494                          !before(tp->urg_seq,tp->rcv_nxt)) {
495                         answ = tp->rcv_nxt - tp->copied_seq;
496
497                         /* Subtract 1, if FIN is in queue. */
498                         if (answ && !skb_queue_empty(&sk->receive_queue))
499                                 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
500                 } else
501                         answ = tp->urg_seq - tp->copied_seq;
502                 release_sock(sk);
503                 break;
504         case SIOCATMARK:
505                 {
506                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
507                         break;
508                 }
509         case SIOCOUTQ:
510                 if (sk->state == TCP_LISTEN)
511                         return(-EINVAL);
512
513                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
514                         answ = 0;
515                 else
516                         answ = tp->write_seq - tp->snd_una;
517                 break;
518         default:
519                 return(-ENOIOCTLCMD);
520         };
521
522         return put_user(answ, (int *)arg);
523 }
524
525
526 int tcp_listen_start(struct sock *sk)
527 {
528         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
529         struct tcp_listen_opt *lopt;
530
531         sk->max_ack_backlog = 0;
532         sk->ack_backlog = 0;
533         tp->accept_queue = tp->accept_queue_tail = NULL;
534         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
535         tcp_delack_init(tp);
536
537         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
538         if (!lopt)
539                 return -ENOMEM;
540
541         memset(lopt, 0, sizeof(struct tcp_listen_opt));
542         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
543                 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
544                         break;
545
546         write_lock_bh(&tp->syn_wait_lock);
547         tp->listen_opt = lopt;
548         write_unlock_bh(&tp->syn_wait_lock);
549
550         /* There is race window here: we announce ourselves listening,
551          * but this transition is still not validated by get_port().
552          * It is OK, because this socket enters to hash table only
553          * after validation is complete.
554          */
555         sk->state = TCP_LISTEN;
556         if (sk->prot->get_port(sk, sk->num) == 0) {
557                 sk->sport = htons(sk->num);
558
559                 sk_dst_reset(sk);
560                 sk->prot->hash(sk);
561
562                 return 0;
563         }
564
565         sk->state = TCP_CLOSE;
566         write_lock_bh(&tp->syn_wait_lock);
567         tp->listen_opt = NULL;
568         write_unlock_bh(&tp->syn_wait_lock);
569         kfree(lopt);
570         return -EADDRINUSE;
571 }
572
573 /*
574  *      This routine closes sockets which have been at least partially
575  *      opened, but not yet accepted.
576  */
577
578 static void tcp_listen_stop (struct sock *sk)
579 {
580         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
581         struct tcp_listen_opt *lopt = tp->listen_opt;
582         struct open_request *acc_req = tp->accept_queue;
583         struct open_request *req;
584         int i;
585
586         tcp_delete_keepalive_timer(sk);
587
588         /* make all the listen_opt local to us */
589         write_lock_bh(&tp->syn_wait_lock);
590         tp->listen_opt =NULL;
591         write_unlock_bh(&tp->syn_wait_lock);
592         tp->accept_queue = tp->accept_queue_tail = NULL;
593
594         if (lopt->qlen) {
595                 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
596                         while ((req = lopt->syn_table[i]) != NULL) {
597                                 lopt->syn_table[i] = req->dl_next;
598                                 lopt->qlen--;
599                                 tcp_openreq_free(req);
600
601                 /* Following specs, it would be better either to send FIN
602                  * (and enter FIN-WAIT-1, it is normal close)
603                  * or to send active reset (abort). 
604                  * Certainly, it is pretty dangerous while synflood, but it is
605                  * bad justification for our negligence 8)
606                  * To be honest, we are not able to make either
607                  * of the variants now.                 --ANK
608                  */
609                         }
610                 }
611         }
612         BUG_TRAP(lopt->qlen == 0);
613
614         kfree(lopt);
615
616         while ((req=acc_req) != NULL) {
617                 struct sock *child = req->sk;
618
619                 acc_req = req->dl_next;
620
621                 local_bh_disable();
622                 bh_lock_sock(child);
623                 BUG_TRAP(child->lock.users==0);
624                 sock_hold(child);
625
626                 tcp_disconnect(child, O_NONBLOCK);
627
628                 sock_orphan(child);
629
630                 atomic_inc(&tcp_orphan_count);
631
632                 tcp_destroy_sock(child);
633
634                 bh_unlock_sock(child);
635                 local_bh_enable();
636                 sock_put(child);
637
638                 tcp_acceptq_removed(sk);
639                 tcp_openreq_fastfree(req);
640         }
641         BUG_TRAP(sk->ack_backlog == 0);
642 }
643
644 /*
645  *      Wait for a socket to get into the connected state
646  *
647  *      Note: Must be called with the socket locked.
648  */
649 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
650 {
651         struct task_struct *tsk = current;
652         DECLARE_WAITQUEUE(wait, tsk);
653
654         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
655                 if(sk->err)
656                         return sock_error(sk);
657                 if((1 << sk->state) &
658                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
659                         return -EPIPE;
660                 if(!*timeo_p)
661                         return -EAGAIN;
662                 if(signal_pending(tsk))
663                         return sock_intr_errno(*timeo_p);
664
665                 __set_task_state(tsk, TASK_INTERRUPTIBLE);
666                 add_wait_queue(sk->sleep, &wait);
667                 sk->tp_pinfo.af_tcp.write_pending++;
668
669                 release_sock(sk);
670                 *timeo_p = schedule_timeout(*timeo_p);
671                 lock_sock(sk);
672
673                 __set_task_state(tsk, TASK_RUNNING);
674                 remove_wait_queue(sk->sleep, &wait);
675                 sk->tp_pinfo.af_tcp.write_pending--;
676         }
677         return 0;
678 }
679
680 static inline int tcp_memory_free(struct sock *sk)
681 {
682         return sk->wmem_queued < sk->sndbuf;
683 }
684
685 /*
686  *      Wait for more memory for a socket
687  */
688 static int wait_for_tcp_memory(struct sock * sk, long *timeo)
689 {
690         int err = 0;
691         long vm_wait = 0;
692         long current_timeo = *timeo;
693         DECLARE_WAITQUEUE(wait, current);
694
695         if (tcp_memory_free(sk))
696                 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
697
698         add_wait_queue(sk->sleep, &wait);
699         for (;;) {
700                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
701
702                 set_current_state(TASK_INTERRUPTIBLE);
703
704                 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
705                         goto do_error;
706                 if (!*timeo)
707                         goto do_nonblock;
708                 if (signal_pending(current))
709                         goto do_interrupted;
710                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
711                 if (tcp_memory_free(sk) && !vm_wait)
712                         break;
713
714                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
715                 sk->tp_pinfo.af_tcp.write_pending++;
716                 release_sock(sk);
717                 if (!tcp_memory_free(sk) || vm_wait)
718                         current_timeo = schedule_timeout(current_timeo);
719                 lock_sock(sk);
720                 sk->tp_pinfo.af_tcp.write_pending--;
721
722                 if (vm_wait) {
723                         vm_wait -= current_timeo;
724                         current_timeo = *timeo;
725                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
726                             (current_timeo -= vm_wait) < 0)
727                                 current_timeo = 0;
728                         vm_wait = 0;
729                 }
730                 *timeo = current_timeo;
731         }
732 out:
733         current->state = TASK_RUNNING;
734         remove_wait_queue(sk->sleep, &wait);
735         return err;
736
737 do_error:
738         err = -EPIPE;
739         goto out;
740 do_nonblock:
741         err = -EAGAIN;
742         goto out;
743 do_interrupted:
744         err = sock_intr_errno(*timeo);
745         goto out;
746 }
747
748 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
749
750 static inline int
751 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
752 {
753         if (i) {
754                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
755                 return page == frag->page &&
756                         off == frag->page_offset+frag->size;
757         }
758         return 0;
759 }
760
761 static inline void
762 fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
763 {
764         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
765         frag->page = page;
766         frag->page_offset = off;
767         frag->size = size;
768         skb_shinfo(skb)->nr_frags = i+1;
769 }
770
771 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
772 {
773         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
774         tp->pushed_seq = tp->write_seq;
775 }
776
777 static inline int forced_push(struct tcp_opt *tp)
778 {
779         return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
780 }
781
782 static inline void
783 skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
784 {
785         skb->csum = 0;
786         TCP_SKB_CB(skb)->seq = tp->write_seq;
787         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
788         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
789         TCP_SKB_CB(skb)->sacked = 0;
790         __skb_queue_tail(&sk->write_queue, skb);
791         tcp_charge_skb(sk, skb);
792         if (tp->send_head == NULL)
793                 tp->send_head = skb;
794 }
795
796 static inline void
797 tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
798 {
799         if (flags & MSG_OOB) {
800                 tp->urg_mode = 1;
801                 tp->snd_up = tp->write_seq;
802                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
803         }
804 }
805
806 static inline void
807 tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
808 {
809         if (tp->send_head) {
810                 struct sk_buff *skb = sk->write_queue.prev;
811                 if (!(flags&MSG_MORE) || forced_push(tp))
812                         tcp_mark_push(tp, skb);
813                 tcp_mark_urg(tp, flags, skb);
814                 __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
815         }
816 }
817
818 static int tcp_error(struct sock *sk, int flags, int err)
819 {
820         if (err == -EPIPE)
821                 err = sock_error(sk) ? : -EPIPE;
822         if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
823                 send_sig(SIGPIPE, current, 0);
824         return err;
825 }
826
827 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
828 {
829         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
830         int mss_now;
831         int err;
832         ssize_t copied;
833         long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
834
835         /* Wait for a connection to finish. */
836         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
837                 if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
838                         goto out_err;
839
840         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
841
842         mss_now = tcp_current_mss(sk);
843         copied = 0;
844
845         err = -EPIPE;
846         if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
847                 goto do_error;
848
849         while (psize > 0) {
850                 struct sk_buff *skb = sk->write_queue.prev;
851                 int offset, size, copy, i;
852                 struct page *page;
853
854                 page = pages[poffset/PAGE_SIZE];
855                 offset = poffset % PAGE_SIZE;
856                 size = min_t(size_t, psize, PAGE_SIZE-offset);
857
858                 if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
859 new_segment:
860                         if (!tcp_memory_free(sk))
861                                 goto wait_for_sndbuf;
862
863                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
864                         if (skb == NULL)
865                                 goto wait_for_memory;
866
867                         skb_entail(sk, tp, skb);
868                         copy = mss_now;
869                 }
870
871                 if (copy > size)
872                         copy = size;
873
874                 i = skb_shinfo(skb)->nr_frags;
875                 if (can_coalesce(skb, i, page, offset)) {
876                         skb_shinfo(skb)->frags[i-1].size += copy;
877                 } else if (i < MAX_SKB_FRAGS) {
878                         get_page(page);
879                         fill_page_desc(skb, i, page, offset, copy);
880                 } else {
881                         tcp_mark_push(tp, skb);
882                         goto new_segment;
883                 }
884
885                 skb->len += copy;
886                 skb->data_len += copy;
887                 skb->ip_summed = CHECKSUM_HW;
888                 tp->write_seq += copy;
889                 TCP_SKB_CB(skb)->end_seq += copy;
890
891                 if (!copied)
892                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
893
894                 copied += copy;
895                 poffset += copy;
896                 if (!(psize -= copy))
897                         goto out;
898
899                 if (skb->len != mss_now || (flags&MSG_OOB))
900                         continue;
901
902                 if (forced_push(tp)) {
903                         tcp_mark_push(tp, skb);
904                         __tcp_push_pending_frames(sk, tp, mss_now, 1);
905                 } else if (skb == tp->send_head)
906                         tcp_push_one(sk, mss_now);
907                 continue;
908
909 wait_for_sndbuf:
910                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
911 wait_for_memory:
912                 if (copied)
913                         tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
914
915                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
916                         goto do_error;
917
918                 mss_now = tcp_current_mss(sk);
919         }
920
921 out:
922         if (copied)
923                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
924         return copied;
925
926 do_error:
927         if (copied)
928                 goto out;
929 out_err:
930         return tcp_error(sk, flags, err);
931 }
932
933 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
934 {
935         ssize_t res;
936         struct sock *sk = sock->sk;
937
938 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
939
940         if (!(sk->route_caps & NETIF_F_SG) || 
941             !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
942                 return sock_no_sendpage(sock, page, offset, size, flags);
943
944 #undef TCP_ZC_CSUM_FLAGS
945
946         lock_sock(sk);
947         TCP_CHECK_TIMER(sk);
948         res = do_tcp_sendpages(sk, &page, offset, size, flags);
949         TCP_CHECK_TIMER(sk);
950         release_sock(sk);
951         return res;
952 }
953
954 #define TCP_PAGE(sk)    (sk->tp_pinfo.af_tcp.sndmsg_page)
955 #define TCP_OFF(sk)     (sk->tp_pinfo.af_tcp.sndmsg_off)
956
957 static inline int
958 tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
959                  struct page *page, int off, int copy)
960 {
961         int err = 0;
962         unsigned int csum;
963
964         csum = csum_and_copy_from_user(from, page_address(page)+off,
965                                        copy, 0, &err);
966         if (!err) {
967                 if (skb->ip_summed == CHECKSUM_NONE)
968                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
969                 skb->len += copy;
970                 skb->data_len += copy;
971                 skb->truesize += copy;
972                 sk->wmem_queued += copy;
973                 sk->forward_alloc -= copy;
974         }
975         return err;
976 }
977
978 static inline int
979 skb_add_data(struct sk_buff *skb, char *from, int copy)
980 {
981         int err = 0;
982         unsigned int csum;
983         int off = skb->len;
984
985         csum = csum_and_copy_from_user(from, skb_put(skb, copy),
986                                        copy, 0, &err);
987         if (!err) {
988                 skb->csum = csum_block_add(skb->csum, csum, off);
989                 return 0;
990         }
991
992         __skb_trim(skb, off);
993         return -EFAULT;
994 }
995
996 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
997 {
998         int tmp = tp->mss_cache;
999
1000         if (sk->route_caps&NETIF_F_SG) {
1001                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1002
1003                 if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1004                         tmp = pgbreak;
1005         }
1006         return tmp;
1007 }
1008
1009 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1010 {
1011         struct iovec *iov;
1012         struct tcp_opt *tp;
1013         struct sk_buff *skb;
1014         int iovlen, flags;
1015         int mss_now;
1016         int err, copied;
1017         long timeo;
1018
1019         tp = &(sk->tp_pinfo.af_tcp);
1020
1021         lock_sock(sk);
1022         TCP_CHECK_TIMER(sk);
1023
1024         flags = msg->msg_flags;
1025         timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1026
1027         /* Wait for a connection to finish. */
1028         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1029                 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1030                         goto out_err;
1031
1032         /* This should be in poll */
1033         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1034
1035         mss_now = tcp_current_mss(sk);
1036
1037         /* Ok commence sending. */
1038         iovlen = msg->msg_iovlen;
1039         iov = msg->msg_iov;
1040         copied = 0;
1041
1042         err = -EPIPE;
1043         if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1044                 goto do_error;
1045
1046         while (--iovlen >= 0) {
1047                 int seglen=iov->iov_len;
1048                 unsigned char * from=iov->iov_base;
1049
1050                 iov++;
1051
1052                 while (seglen > 0) {
1053                         int copy;
1054                         
1055                         skb = sk->write_queue.prev;
1056
1057                         if (tp->send_head == NULL ||
1058                             (copy = mss_now - skb->len) <= 0) {
1059
1060 new_segment:
1061                                 /* Allocate new segment. If the interface is SG,
1062                                  * allocate skb fitting to single page.
1063                                  */
1064                                 if (!tcp_memory_free(sk))
1065                                         goto wait_for_sndbuf;
1066
1067                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1068                                 if (skb == NULL)
1069                                         goto wait_for_memory;
1070
1071                                 skb_entail(sk, tp, skb);
1072                                 copy = mss_now;
1073                         }
1074
1075                         /* Try to append data to the end of skb. */
1076                         if (copy > seglen)
1077                                 copy = seglen;
1078
1079                         /* Where to copy to? */
1080                         if (skb_tailroom(skb) > 0) {
1081                                 /* We have some space in skb head. Superb! */
1082                                 if (copy > skb_tailroom(skb))
1083                                         copy = skb_tailroom(skb);
1084                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1085                                         goto do_fault;
1086                         } else {
1087                                 int merge = 0;
1088                                 int i = skb_shinfo(skb)->nr_frags;
1089                                 struct page *page = TCP_PAGE(sk);
1090                                 int off = TCP_OFF(sk);
1091
1092                                 if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1093                                         /* We can extend the last page fragment. */
1094                                         merge = 1;
1095                                 } else if (i == MAX_SKB_FRAGS ||
1096                                            (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1097                                         /* Need to add new fragment and cannot
1098                                          * do this because interface is non-SG,
1099                                          * or because all the page slots are busy.
1100                                          */
1101                                         tcp_mark_push(tp, skb);
1102                                         goto new_segment;
1103                                 } else if (page) {
1104                                         /* If page is cached, align
1105                                          * offset to L1 cache boundary
1106                                          */
1107                                         off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1108                                         if (off == PAGE_SIZE) {
1109                                                 put_page(page);
1110                                                 TCP_PAGE(sk) = page = NULL;
1111                                         }
1112                                 }
1113
1114                                 if (!page) {
1115                                         /* Allocate new cache page. */
1116                                         if (!(page=tcp_alloc_page(sk)))
1117                                                 goto wait_for_memory;
1118                                         off = 0;
1119                                 }
1120
1121                                 if (copy > PAGE_SIZE-off)
1122                                         copy = PAGE_SIZE-off;
1123
1124                                 /* Time to copy data. We are close to the end! */
1125                                 err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1126                                 if (err) {
1127                                         /* If this page was new, give it to the
1128                                          * socket so it does not get leaked.
1129                                          */
1130                                         if (TCP_PAGE(sk) == NULL) {
1131                                                 TCP_PAGE(sk) = page;
1132                                                 TCP_OFF(sk) = 0;
1133                                         }
1134                                         goto do_error;
1135                                 }
1136
1137                                 /* Update the skb. */
1138                                 if (merge) {
1139                                         skb_shinfo(skb)->frags[i-1].size += copy;
1140                                 } else {
1141                                         fill_page_desc(skb, i, page, off, copy);
1142                                         if (TCP_PAGE(sk)) {
1143                                                 get_page(page);
1144                                         } else if (off + copy < PAGE_SIZE) {
1145                                                 get_page(page);
1146                                                 TCP_PAGE(sk) = page;
1147                                         }
1148                                 }
1149
1150                                 TCP_OFF(sk) = off+copy;
1151                         }
1152
1153                         if (!copied)
1154                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1155
1156                         tp->write_seq += copy;
1157                         TCP_SKB_CB(skb)->end_seq += copy;
1158
1159                         from += copy;
1160                         copied += copy;
1161                         if ((seglen -= copy) == 0 && iovlen == 0)
1162                                 goto out;
1163
1164                         if (skb->len != mss_now || (flags&MSG_OOB))
1165                                 continue;
1166
1167                         if (forced_push(tp)) {
1168                                 tcp_mark_push(tp, skb);
1169                                 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1170                         } else if (skb == tp->send_head)
1171                                 tcp_push_one(sk, mss_now);
1172                         continue;
1173
1174 wait_for_sndbuf:
1175                         set_bit(SOCK_NOSPACE, &sk->socket->flags);
1176 wait_for_memory:
1177                         if (copied)
1178                                 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1179
1180                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1181                                 goto do_error;
1182
1183                         mss_now = tcp_current_mss(sk);
1184                 }
1185         }
1186
1187 out:
1188         if (copied)
1189                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1190         TCP_CHECK_TIMER(sk);
1191         release_sock(sk);
1192         return copied;
1193
1194 do_fault:
1195         if (skb->len == 0) {
1196                 if (tp->send_head == skb)
1197                         tp->send_head = NULL;
1198                 __skb_unlink(skb, skb->list);
1199                 tcp_free_skb(sk, skb);
1200         }
1201
1202 do_error:
1203         if (copied)
1204                 goto out;
1205 out_err:
1206         err = tcp_error(sk, flags, err);
1207         TCP_CHECK_TIMER(sk);
1208         release_sock(sk);
1209         return err;
1210 }
1211
1212 /*
1213  *      Handle reading urgent data. BSD has very simple semantics for
1214  *      this, no blocking and very strange errors 8)
1215  */
1216
1217 static int tcp_recv_urg(struct sock * sk, long timeo,
1218                         struct msghdr *msg, int len, int flags, 
1219                         int *addr_len)
1220 {
1221         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1222
1223         /* No URG data to read. */
1224         if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1225                 return -EINVAL; /* Yes this is right ! */
1226
1227         if (sk->state==TCP_CLOSE && !sk->done)
1228                 return -ENOTCONN;
1229
1230         if (tp->urg_data & TCP_URG_VALID) {
1231                 int err = 0; 
1232                 char c = tp->urg_data;
1233
1234                 if (!(flags & MSG_PEEK))
1235                         tp->urg_data = TCP_URG_READ;
1236
1237                 /* Read urgent data. */
1238                 msg->msg_flags|=MSG_OOB;
1239
1240                 if(len>0) {
1241                         if (!(flags & MSG_TRUNC))
1242                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1243                         len = 1;
1244                 } else
1245                         msg->msg_flags|=MSG_TRUNC;
1246
1247                 return err ? -EFAULT : len;
1248         }
1249
1250         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1251                 return 0;
1252
1253         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1254          * the available implementations agree in this case:
1255          * this call should never block, independent of the
1256          * blocking state of the socket.
1257          * Mike <pall@rz.uni-karlsruhe.de>
1258          */
1259         return -EAGAIN;
1260 }
1261
1262 /*
1263  *      Release a skb if it is no longer needed. This routine
1264  *      must be called with interrupts disabled or with the
1265  *      socket locked so that the sk_buff queue operation is ok.
1266  */
1267
1268 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1269 {
1270         __skb_unlink(skb, &sk->receive_queue);
1271         __kfree_skb(skb);
1272 }
1273
1274 /* Clean up the receive buffer for full frames taken by the user,
1275  * then send an ACK if necessary.  COPIED is the number of bytes
1276  * tcp_recvmsg has given to the user so far, it speeds up the
1277  * calculation of whether or not we must ACK for the sake of
1278  * a window update.
1279  */
1280 static void cleanup_rbuf(struct sock *sk, int copied)
1281 {
1282         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1283         int time_to_ack = 0;
1284
1285 #if TCP_DEBUG
1286         struct sk_buff *skb = skb_peek(&sk->receive_queue);
1287
1288         BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1289 #endif
1290
1291         if (tcp_ack_scheduled(tp)) {
1292                    /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1293                 if (tp->ack.blocked
1294                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1295                     || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1296                     /*
1297                      * If this read emptied read buffer, we send ACK, if
1298                      * connection is not bidirectional, user drained
1299                      * receive buffer and there was a small segment
1300                      * in queue.
1301                      */
1302                     || (copied > 0 &&
1303                         (tp->ack.pending&TCP_ACK_PUSHED) &&
1304                         !tp->ack.pingpong &&
1305                         atomic_read(&sk->rmem_alloc) == 0)) {
1306                         time_to_ack = 1;
1307                 }
1308         }
1309
1310         /* We send an ACK if we can now advertise a non-zero window
1311          * which has been raised "significantly".
1312          *
1313          * Even if window raised up to infinity, do not send window open ACK
1314          * in states, where we will not receive more. It is useless.
1315          */
1316         if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1317                 __u32 rcv_window_now = tcp_receive_window(tp);
1318
1319                 /* Optimize, __tcp_select_window() is not cheap. */
1320                 if (2*rcv_window_now <= tp->window_clamp) {
1321                         __u32 new_window = __tcp_select_window(sk);
1322
1323                         /* Send ACK now, if this read freed lots of space
1324                          * in our buffer. Certainly, new_window is new window.
1325                          * We can advertise it now, if it is not less than current one.
1326                          * "Lots" means "at least twice" here.
1327                          */
1328                         if(new_window && new_window >= 2*rcv_window_now)
1329                                 time_to_ack = 1;
1330                 }
1331         }
1332         if (time_to_ack)
1333                 tcp_send_ack(sk);
1334 }
1335
1336 /* Now socket state including sk->err is changed only under lock,
1337  * hence we may omit checks after joining wait queue.
1338  * We check receive queue before schedule() only as optimization;
1339  * it is very likely that release_sock() added new data.
1340  */
1341
1342 static long tcp_data_wait(struct sock *sk, long timeo)
1343 {
1344         DECLARE_WAITQUEUE(wait, current);
1345
1346         add_wait_queue(sk->sleep, &wait);
1347
1348         __set_current_state(TASK_INTERRUPTIBLE);
1349
1350         set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1351         release_sock(sk);
1352
1353         if (skb_queue_empty(&sk->receive_queue))
1354                 timeo = schedule_timeout(timeo);
1355
1356         lock_sock(sk);
1357         clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1358
1359         remove_wait_queue(sk->sleep, &wait);
1360         __set_current_state(TASK_RUNNING);
1361         return timeo;
1362 }
1363
1364 static void tcp_prequeue_process(struct sock *sk)
1365 {
1366         struct sk_buff *skb;
1367         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1368
1369         net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1370
1371         /* RX process wants to run with disabled BHs, though it is not necessary */
1372         local_bh_disable();
1373         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1374                 sk->backlog_rcv(sk, skb);
1375         local_bh_enable();
1376
1377         /* Clear memory counter. */
1378         tp->ucopy.memory = 0;
1379 }
1380
1381 static inline
1382 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1383 {
1384         struct sk_buff *skb;
1385         u32 offset;
1386
1387         skb_queue_walk(&sk->receive_queue, skb) {
1388                 offset = seq - TCP_SKB_CB(skb)->seq;
1389                 if (skb->h.th->syn)
1390                         offset--;
1391                 if (offset < skb->len || skb->h.th->fin) {
1392                         *off = offset;
1393                         return skb;
1394                 }
1395         }
1396         return NULL;
1397 }
1398
1399 /*
1400  * This routine provides an alternative to tcp_recvmsg() for routines
1401  * that would like to handle copying from skbuffs directly in 'sendfile'
1402  * fashion.
1403  * Note:
1404  *      - It is assumed that the socket was locked by the caller.
1405  *      - The routine does not block.
1406  *      - At present, there is no support for reading OOB data
1407  *        or for 'peeking' the socket using this routine
1408  *        (although both would be easy to implement).
1409  */
1410 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1411                   sk_read_actor_t recv_actor)
1412 {
1413         struct sk_buff *skb;
1414         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1415         u32 seq = tp->copied_seq;
1416         u32 offset;
1417         int copied = 0;
1418
1419         if (sk->state == TCP_LISTEN)
1420                 return -ENOTCONN;
1421         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1422                 if (offset < skb->len) {
1423                         size_t used, len;
1424
1425                         len = skb->len - offset;
1426                         /* Stop reading if we hit a patch of urgent data */
1427                         if (tp->urg_data) {
1428                                 u32 urg_offset = tp->urg_seq - seq;
1429                                 if (urg_offset < len)
1430                                         len = urg_offset;
1431                                 if (!len)
1432                                         break;
1433                         }
1434                         used = recv_actor(desc, skb, offset, len);
1435                         if (used <= len) {
1436                                 seq += used;
1437                                 copied += used;
1438                                 offset += used;
1439                         }
1440                         if (offset != skb->len)
1441                                 break;
1442                 }
1443                 if (skb->h.th->fin) {
1444                         tcp_eat_skb(sk, skb);
1445                         ++seq;
1446                         break;
1447                 }
1448                 tcp_eat_skb(sk, skb);
1449                 if (!desc->count)
1450                         break;
1451         }
1452         tp->copied_seq = seq;
1453         /* Clean up data we have read: This will do ACK frames. */
1454         if (copied)
1455                 cleanup_rbuf(sk, copied);
1456         return copied;
1457 }
1458
1459 /*
1460  *      This routine copies from a sock struct into the user buffer. 
1461  *
1462  *      Technical note: in 2.3 we work on _locked_ socket, so that
1463  *      tricks with *seq access order and skb->users are not required.
1464  *      Probably, code can be easily improved even more.
1465  */
1466  
1467 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1468                 int len, int nonblock, int flags, int *addr_len)
1469 {
1470         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1471         int copied = 0;
1472         u32 peek_seq;
1473         u32 *seq;
1474         unsigned long used;
1475         int err;
1476         int target;             /* Read at least this many bytes */
1477         long timeo;
1478         struct task_struct *user_recv = NULL;
1479
1480         lock_sock(sk);
1481
1482         TCP_CHECK_TIMER(sk);
1483
1484         err = -ENOTCONN;
1485         if (sk->state == TCP_LISTEN)
1486                 goto out;
1487
1488         timeo = sock_rcvtimeo(sk, nonblock);
1489
1490         /* Urgent data needs to be handled specially. */
1491         if (flags & MSG_OOB)
1492                 goto recv_urg;
1493
1494         seq = &tp->copied_seq;
1495         if (flags & MSG_PEEK) {
1496                 peek_seq = tp->copied_seq;
1497                 seq = &peek_seq;
1498         }
1499
1500         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1501
1502         do {
1503                 struct sk_buff * skb;
1504                 u32 offset;
1505
1506                 /* Are we at urgent data? Stop if we have read anything. */
1507                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1508                         break;
1509
1510                 /* We need to check signals first, to get correct SIGURG
1511                  * handling. FIXME: Need to check this doesn't impact 1003.1g
1512                  * and move it down to the bottom of the loop
1513                  */
1514                 if (signal_pending(current)) {
1515                         if (copied)
1516                                 break;
1517                         copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1518                         break;
1519                 }
1520
1521                 /* Next get a buffer. */
1522
1523                 skb = skb_peek(&sk->receive_queue);
1524                 do {
1525                         if (!skb)
1526                                 break;
1527
1528                         /* Now that we have two receive queues this 
1529                          * shouldn't happen.
1530                          */
1531                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1532                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1533                                        *seq, TCP_SKB_CB(skb)->seq);
1534                                 break;
1535                         }
1536                         offset = *seq - TCP_SKB_CB(skb)->seq;
1537                         if (skb->h.th->syn)
1538                                 offset--;
1539                         if (offset < skb->len)
1540                                 goto found_ok_skb;
1541                         if (skb->h.th->fin)
1542                                 goto found_fin_ok;
1543                         BUG_TRAP(flags&MSG_PEEK);
1544                         skb = skb->next;
1545                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1546
1547                 /* Well, if we have backlog, try to process it now yet. */
1548
1549                 if (copied >= target && sk->backlog.tail == NULL)
1550                         break;
1551
1552                 if (copied) {
1553                         if (sk->err ||
1554                             sk->state == TCP_CLOSE ||
1555                             (sk->shutdown & RCV_SHUTDOWN) ||
1556                             !timeo ||
1557                             (flags & MSG_PEEK))
1558                                 break;
1559                 } else {
1560                         if (sk->done)
1561                                 break;
1562
1563                         if (sk->err) {
1564                                 copied = sock_error(sk);
1565                                 break;
1566                         }
1567
1568                         if (sk->shutdown & RCV_SHUTDOWN)
1569                                 break;
1570
1571                         if (sk->state == TCP_CLOSE) {
1572                                 if (!sk->done) {
1573                                         /* This occurs when user tries to read
1574                                          * from never connected socket.
1575                                          */
1576                                         copied = -ENOTCONN;
1577                                         break;
1578                                 }
1579                                 break;
1580                         }
1581
1582                         if (!timeo) {
1583                                 copied = -EAGAIN;
1584                                 break;
1585                         }
1586                 }
1587
1588                 cleanup_rbuf(sk, copied);
1589
1590                 if (tp->ucopy.task == user_recv) {
1591                         /* Install new reader */
1592                         if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1593                                 user_recv = current;
1594                                 tp->ucopy.task = user_recv;
1595                                 tp->ucopy.iov = msg->msg_iov;
1596                         }
1597
1598                         tp->ucopy.len = len;
1599
1600                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1601
1602                         /* Ugly... If prequeue is not empty, we have to
1603                          * process it before releasing socket, otherwise
1604                          * order will be broken at second iteration.
1605                          * More elegant solution is required!!!
1606                          *
1607                          * Look: we have the following (pseudo)queues:
1608                          *
1609                          * 1. packets in flight
1610                          * 2. backlog
1611                          * 3. prequeue
1612                          * 4. receive_queue
1613                          *
1614                          * Each queue can be processed only if the next ones
1615                          * are empty. At this point we have empty receive_queue.
1616                          * But prequeue _can_ be not empty after second iteration,
1617                          * when we jumped to start of loop because backlog
1618                          * processing added something to receive_queue.
1619                          * We cannot release_sock(), because backlog contains
1620                          * packets arrived _after_ prequeued ones.
1621                          *
1622                          * Shortly, algorithm is clear --- to process all
1623                          * the queues in order. We could make it more directly,
1624                          * requeueing packets from backlog to prequeue, if
1625                          * is not empty. It is more elegant, but eats cycles,
1626                          * unfortunately.
1627                          */
1628                         if (skb_queue_len(&tp->ucopy.prequeue))
1629                                 goto do_prequeue;
1630
1631                         /* __ Set realtime policy in scheduler __ */
1632                 }
1633
1634                 if (copied >= target) {
1635                         /* Do not sleep, just process backlog. */
1636                         release_sock(sk);
1637                         lock_sock(sk);
1638                 } else {
1639                         timeo = tcp_data_wait(sk, timeo);
1640                 }
1641
1642                 if (user_recv) {
1643                         int chunk;
1644
1645                         /* __ Restore normal policy in scheduler __ */
1646
1647                         if ((chunk = len - tp->ucopy.len) != 0) {
1648                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1649                                 len -= chunk;
1650                                 copied += chunk;
1651                         }
1652
1653                         if (tp->rcv_nxt == tp->copied_seq &&
1654                             skb_queue_len(&tp->ucopy.prequeue)) {
1655 do_prequeue:
1656                                 tcp_prequeue_process(sk);
1657
1658                                 if ((chunk = len - tp->ucopy.len) != 0) {
1659                                         net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1660                                         len -= chunk;
1661                                         copied += chunk;
1662                                 }
1663                         }
1664                 }
1665                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1666                         if (net_ratelimit())
1667                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1668                                        current->comm, current->pid);
1669                         peek_seq = tp->copied_seq;
1670                 }
1671                 continue;
1672
1673         found_ok_skb:
1674                 /* Ok so how much can we use? */
1675                 used = skb->len - offset;
1676                 if (len < used)
1677                         used = len;
1678
1679                 /* Do we have urgent data here? */
1680                 if (tp->urg_data) {
1681                         u32 urg_offset = tp->urg_seq - *seq;
1682                         if (urg_offset < used) {
1683                                 if (!urg_offset) {
1684                                         if (!sk->urginline) {
1685                                                 ++*seq;
1686                                                 offset++;
1687                                                 used--;
1688                                                 if (!used)
1689                                                         goto skip_copy;
1690                                         }
1691                                 } else
1692                                         used = urg_offset;
1693                         }
1694                 }
1695
1696                 if (!(flags&MSG_TRUNC)) {
1697                         err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1698                         if (err) {
1699                                 /* Exception. Bailout! */
1700                                 if (!copied)
1701                                         copied = -EFAULT;
1702                                 break;
1703                         }
1704                 }
1705
1706                 *seq += used;
1707                 copied += used;
1708                 len -= used;
1709
1710 skip_copy:
1711                 if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1712                         tp->urg_data = 0;
1713                         tcp_fast_path_check(sk, tp);
1714                 }
1715                 if (used + offset < skb->len)
1716                         continue;
1717
1718                 if (skb->h.th->fin)
1719                         goto found_fin_ok;
1720                 if (!(flags & MSG_PEEK))
1721                         tcp_eat_skb(sk, skb);
1722                 continue;
1723
1724         found_fin_ok:
1725                 /* Process the FIN. */
1726                 ++*seq;
1727                 if (!(flags & MSG_PEEK))
1728                         tcp_eat_skb(sk, skb);
1729                 break;
1730         } while (len > 0);
1731
1732         if (user_recv) {
1733                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1734                         int chunk;
1735
1736                         tp->ucopy.len = copied > 0 ? len : 0;
1737
1738                         tcp_prequeue_process(sk);
1739
1740                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1741                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1742                                 len -= chunk;
1743                                 copied += chunk;
1744                         }
1745                 }
1746
1747                 tp->ucopy.task = NULL;
1748                 tp->ucopy.len = 0;
1749         }
1750
1751         /* According to UNIX98, msg_name/msg_namelen are ignored
1752          * on connected socket. I was just happy when found this 8) --ANK
1753          */
1754
1755         /* Clean up data we have read: This will do ACK frames. */
1756         cleanup_rbuf(sk, copied);
1757
1758         TCP_CHECK_TIMER(sk);
1759         release_sock(sk);
1760         return copied;
1761
1762 out:
1763         TCP_CHECK_TIMER(sk);
1764         release_sock(sk);
1765         return err;
1766
1767 recv_urg:
1768         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1769         goto out;
1770 }
1771
1772 /*
1773  *      State processing on a close. This implements the state shift for
1774  *      sending our FIN frame. Note that we only send a FIN for some
1775  *      states. A shutdown() may have already sent the FIN, or we may be
1776  *      closed.
1777  */
1778
1779 static unsigned char new_state[16] = {
1780   /* current state:        new state:      action:      */
1781   /* (Invalid)          */ TCP_CLOSE,
1782   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1783   /* TCP_SYN_SENT       */ TCP_CLOSE,
1784   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1785   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1786   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1787   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1788   /* TCP_CLOSE          */ TCP_CLOSE,
1789   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1790   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1791   /* TCP_LISTEN         */ TCP_CLOSE,
1792   /* TCP_CLOSING        */ TCP_CLOSING,
1793 };
1794
1795 static int tcp_close_state(struct sock *sk)
1796 {
1797         int next = (int) new_state[sk->state];
1798         int ns = (next & TCP_STATE_MASK);
1799
1800         tcp_set_state(sk, ns);
1801
1802         return (next & TCP_ACTION_FIN);
1803 }
1804
1805 /*
1806  *      Shutdown the sending side of a connection. Much like close except
1807  *      that we don't receive shut down or set sk->dead.
1808  */
1809
1810 void tcp_shutdown(struct sock *sk, int how)
1811 {
1812         /*      We need to grab some memory, and put together a FIN,
1813          *      and then put it into the queue to be sent.
1814          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1815          */
1816         if (!(how & SEND_SHUTDOWN))
1817                 return;
1818
1819         /* If we've already sent a FIN, or it's a closed state, skip this. */
1820         if ((1 << sk->state) &
1821             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1822                 /* Clear out any half completed packets.  FIN if needed. */
1823                 if (tcp_close_state(sk))
1824                         tcp_send_fin(sk);
1825         }
1826 }
1827
1828
1829 /*
1830  *      Return 1 if we still have things to send in our buffers.
1831  */
1832
1833 static inline int closing(struct sock * sk)
1834 {
1835         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1836 }
1837
1838 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1839 {
1840         /* First the read buffer. */
1841         __skb_queue_purge(&sk->receive_queue);
1842
1843         /* Next, the error queue. */
1844         __skb_queue_purge(&sk->error_queue);
1845
1846         /* Next, the write queue. */
1847         BUG_TRAP(skb_queue_empty(&sk->write_queue));
1848
1849         /* Account for returned memory. */
1850         tcp_mem_reclaim(sk);
1851
1852         BUG_TRAP(sk->wmem_queued == 0);
1853         BUG_TRAP(sk->forward_alloc == 0);
1854
1855         /* It is _impossible_ for the backlog to contain anything
1856          * when we get here.  All user references to this socket
1857          * have gone away, only the net layer knows can touch it.
1858          */
1859 }
1860
1861 /*
1862  * At this point, there should be no process reference to this
1863  * socket, and thus no user references at all.  Therefore we
1864  * can assume the socket waitqueue is inactive and nobody will
1865  * try to jump onto it.
1866  */
1867 void tcp_destroy_sock(struct sock *sk)
1868 {
1869         BUG_TRAP(sk->state==TCP_CLOSE);
1870         BUG_TRAP(sk->dead);
1871
1872         /* It cannot be in hash table! */
1873         BUG_TRAP(sk->pprev==NULL);
1874
1875         /* If it has not 0 sk->num, it must be bound */
1876         BUG_TRAP(!sk->num || sk->prev!=NULL);
1877
1878 #ifdef TCP_DEBUG
1879         if (sk->zapped) {
1880                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1881                 sock_hold(sk);
1882         }
1883         sk->zapped = 1;
1884 #endif
1885
1886         sk->prot->destroy(sk);
1887
1888         tcp_kill_sk_queues(sk);
1889
1890 #ifdef INET_REFCNT_DEBUG
1891         if (atomic_read(&sk->refcnt) != 1) {
1892                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1893         }
1894 #endif
1895
1896         atomic_dec(&tcp_orphan_count);
1897         sock_put(sk);
1898 }
1899
1900 void tcp_close(struct sock *sk, long timeout)
1901 {
1902         struct sk_buff *skb;
1903         int data_was_unread = 0;
1904
1905         lock_sock(sk);
1906         sk->shutdown = SHUTDOWN_MASK;
1907
1908         if(sk->state == TCP_LISTEN) {
1909                 tcp_set_state(sk, TCP_CLOSE);
1910
1911                 /* Special case. */
1912                 tcp_listen_stop(sk);
1913
1914                 goto adjudge_to_death;
1915         }
1916
1917         /*  We need to flush the recv. buffs.  We do this only on the
1918          *  descriptor close, not protocol-sourced closes, because the
1919          *  reader process may not have drained the data yet!
1920          */
1921         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1922                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1923                 data_was_unread += len;
1924                 __kfree_skb(skb);
1925         }
1926
1927         tcp_mem_reclaim(sk);
1928
1929         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1930          * 3.10, we send a RST here because data was lost.  To
1931          * witness the awful effects of the old behavior of always
1932          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1933          * a bulk GET in an FTP client, suspend the process, wait
1934          * for the client to advertise a zero window, then kill -9
1935          * the FTP client, wheee...  Note: timeout is always zero
1936          * in such a case.
1937          */
1938         if(data_was_unread != 0) {
1939                 /* Unread data was tossed, zap the connection. */
1940                 NET_INC_STATS_USER(TCPAbortOnClose);
1941                 tcp_set_state(sk, TCP_CLOSE);
1942                 tcp_send_active_reset(sk, GFP_KERNEL);
1943         } else if (sk->linger && sk->lingertime==0) {
1944                 /* Check zero linger _after_ checking for unread data. */
1945                 sk->prot->disconnect(sk, 0);
1946                 NET_INC_STATS_USER(TCPAbortOnData);
1947         } else if (tcp_close_state(sk)) {
1948                 /* We FIN if the application ate all the data before
1949                  * zapping the connection.
1950                  */
1951
1952                 /* RED-PEN. Formally speaking, we have broken TCP state
1953                  * machine. State transitions:
1954                  *
1955                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1956                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1957                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1958                  *
1959                  * are legal only when FIN has been sent (i.e. in window),
1960                  * rather than queued out of window. Purists blame.
1961                  *
1962                  * F.e. "RFC state" is ESTABLISHED,
1963                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1964                  *
1965                  * The visible declinations are that sometimes
1966                  * we enter time-wait state, when it is not required really
1967                  * (harmless), do not send active resets, when they are
1968                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1969                  * they look as CLOSING or LAST_ACK for Linux)
1970                  * Probably, I missed some more holelets.
1971                  *                                              --ANK
1972                  */
1973                 tcp_send_fin(sk);
1974         }
1975
1976         if (timeout) {
1977                 struct task_struct *tsk = current;
1978                 DECLARE_WAITQUEUE(wait, current);
1979
1980                 add_wait_queue(sk->sleep, &wait);
1981
1982                 do {
1983                         set_current_state(TASK_INTERRUPTIBLE);
1984                         if (!closing(sk))
1985                                 break;
1986                         release_sock(sk);
1987                         timeout = schedule_timeout(timeout);
1988                         lock_sock(sk);
1989                 } while (!signal_pending(tsk) && timeout);
1990
1991                 tsk->state = TASK_RUNNING;
1992                 remove_wait_queue(sk->sleep, &wait);
1993         }
1994
1995 adjudge_to_death:
1996         /* It is the last release_sock in its life. It will remove backlog. */
1997         release_sock(sk);
1998
1999
2000         /* Now socket is owned by kernel and we acquire BH lock
2001            to finish close. No need to check for user refs.
2002          */
2003         local_bh_disable();
2004         bh_lock_sock(sk);
2005         BUG_TRAP(sk->lock.users==0);
2006
2007         sock_hold(sk);
2008         sock_orphan(sk);
2009
2010         /*      This is a (useful) BSD violating of the RFC. There is a
2011          *      problem with TCP as specified in that the other end could
2012          *      keep a socket open forever with no application left this end.
2013          *      We use a 3 minute timeout (about the same as BSD) then kill
2014          *      our end. If they send after that then tough - BUT: long enough
2015          *      that we won't make the old 4*rto = almost no time - whoops
2016          *      reset mistake.
2017          *
2018          *      Nope, it was not mistake. It is really desired behaviour
2019          *      f.e. on http servers, when such sockets are useless, but
2020          *      consume significant resources. Let's do it with special
2021          *      linger2 option.                                 --ANK
2022          */
2023
2024         if (sk->state == TCP_FIN_WAIT2) {
2025                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2026                 if (tp->linger2 < 0) {
2027                         tcp_set_state(sk, TCP_CLOSE);
2028                         tcp_send_active_reset(sk, GFP_ATOMIC);
2029                         NET_INC_STATS_BH(TCPAbortOnLinger);
2030                 } else {
2031                         int tmo = tcp_fin_time(tp);
2032
2033                         if (tmo > TCP_TIMEWAIT_LEN) {
2034                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2035                         } else {
2036                                 atomic_inc(&tcp_orphan_count);
2037                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2038                                 goto out;
2039                         }
2040                 }
2041         }
2042         if (sk->state != TCP_CLOSE) {
2043                 tcp_mem_reclaim(sk);
2044                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2045                     (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2046                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2047                         if (net_ratelimit())
2048                                 printk(KERN_INFO "TCP: too many of orphaned sockets\n");
2049                         tcp_set_state(sk, TCP_CLOSE);
2050                         tcp_send_active_reset(sk, GFP_ATOMIC);
2051                         NET_INC_STATS_BH(TCPAbortOnMemory);
2052                 }
2053         }
2054         atomic_inc(&tcp_orphan_count);
2055
2056         if (sk->state == TCP_CLOSE)
2057                 tcp_destroy_sock(sk);
2058         /* Otherwise, socket is reprieved until protocol close. */
2059
2060 out:
2061         bh_unlock_sock(sk);
2062         local_bh_enable();
2063         sock_put(sk);
2064 }
2065
2066 /* These states need RST on ABORT according to RFC793 */
2067
2068 extern __inline__ int tcp_need_reset(int state)
2069 {
2070         return ((1 << state) &
2071                 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2072                  TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2073 }
2074
2075 int tcp_disconnect(struct sock *sk, int flags)
2076 {
2077         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2078         int old_state;
2079         int err = 0;
2080
2081         old_state = sk->state;
2082         if (old_state != TCP_CLOSE)
2083                 tcp_set_state(sk, TCP_CLOSE);
2084
2085         /* ABORT function of RFC793 */
2086         if (old_state == TCP_LISTEN) {
2087                 tcp_listen_stop(sk);
2088         } else if (tcp_need_reset(old_state) ||
2089                    (tp->snd_nxt != tp->write_seq &&
2090                     (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2091                 /* The last check adjusts for discrepance of Linux wrt. RFC
2092                  * states
2093                  */
2094                 tcp_send_active_reset(sk, gfp_any());
2095                 sk->err = ECONNRESET;
2096         } else if (old_state == TCP_SYN_SENT)
2097                 sk->err = ECONNRESET;
2098
2099         tcp_clear_xmit_timers(sk);
2100         __skb_queue_purge(&sk->receive_queue);
2101         tcp_writequeue_purge(sk);
2102         __skb_queue_purge(&tp->out_of_order_queue);
2103
2104         sk->dport = 0;
2105
2106         if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2107                 sk->rcv_saddr = 0;
2108                 sk->saddr = 0;
2109 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2110                 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2111                 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2112 #endif
2113         }
2114
2115         sk->shutdown = 0;
2116         sk->done = 0;
2117         tp->srtt = 0;
2118         if ((tp->write_seq += tp->max_window+2) == 0)
2119                 tp->write_seq = 1;
2120         tp->backoff = 0;
2121         tp->snd_cwnd = 2;
2122         tp->probes_out = 0;
2123         tp->packets_out = 0;
2124         tp->snd_ssthresh = 0x7fffffff;
2125         tp->snd_cwnd_cnt = 0;
2126         tp->ca_state = TCP_CA_Open;
2127         tcp_clear_retrans(tp);
2128         tcp_delack_init(tp);
2129         tp->send_head = NULL;
2130         tp->saw_tstamp = 0;
2131         tcp_sack_reset(tp);
2132         __sk_dst_reset(sk);
2133
2134         BUG_TRAP(!sk->num || sk->prev);
2135
2136         sk->error_report(sk);
2137         return err;
2138 }
2139
2140 /*
2141  *      Wait for an incoming connection, avoid race
2142  *      conditions. This must be called with the socket locked.
2143  */
2144 static int wait_for_connect(struct sock * sk, long timeo)
2145 {
2146         DECLARE_WAITQUEUE(wait, current);
2147         int err;
2148
2149         /*
2150          * True wake-one mechanism for incoming connections: only
2151          * one process gets woken up, not the 'whole herd'.
2152          * Since we do not 'race & poll' for established sockets
2153          * anymore, the common case will execute the loop only once.
2154          *
2155          * Subtle issue: "add_wait_queue_exclusive()" will be added
2156          * after any current non-exclusive waiters, and we know that
2157          * it will always _stay_ after any new non-exclusive waiters
2158          * because all non-exclusive waiters are added at the
2159          * beginning of the wait-queue. As such, it's ok to "drop"
2160          * our exclusiveness temporarily when we get woken up without
2161          * having to remove and re-insert us on the wait queue.
2162          */
2163         add_wait_queue_exclusive(sk->sleep, &wait);
2164         for (;;) {
2165                 current->state = TASK_INTERRUPTIBLE;
2166                 release_sock(sk);
2167                 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2168                         timeo = schedule_timeout(timeo);
2169                 lock_sock(sk);
2170                 err = 0;
2171                 if (sk->tp_pinfo.af_tcp.accept_queue)
2172                         break;
2173                 err = -EINVAL;
2174                 if (sk->state != TCP_LISTEN)
2175                         break;
2176                 err = sock_intr_errno(timeo);
2177                 if (signal_pending(current))
2178                         break;
2179                 err = -EAGAIN;
2180                 if (!timeo)
2181                         break;
2182         }
2183         current->state = TASK_RUNNING;
2184         remove_wait_queue(sk->sleep, &wait);
2185         return err;
2186 }
2187
2188 /*
2189  *      This will accept the next outstanding connection.
2190  */
2191
2192 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2193 {
2194         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2195         struct open_request *req;
2196         struct sock *newsk;
2197         int error;
2198
2199         lock_sock(sk); 
2200
2201         /* We need to make sure that this socket is listening,
2202          * and that it has something pending.
2203          */
2204         error = -EINVAL;
2205         if (sk->state != TCP_LISTEN)
2206                 goto out;
2207
2208         /* Find already established connection */
2209         if (!tp->accept_queue) {
2210                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2211
2212                 /* If this is a non blocking socket don't sleep */
2213                 error = -EAGAIN;
2214                 if (!timeo)
2215                         goto out;
2216
2217                 error = wait_for_connect(sk, timeo);
2218                 if (error)
2219                         goto out;
2220         }
2221
2222         req = tp->accept_queue;
2223         if ((tp->accept_queue = req->dl_next) == NULL)
2224                 tp->accept_queue_tail = NULL;
2225
2226         newsk = req->sk;
2227         tcp_acceptq_removed(sk);
2228         tcp_openreq_fastfree(req);
2229         BUG_TRAP(newsk->state != TCP_SYN_RECV);
2230         release_sock(sk);
2231         return newsk;
2232
2233 out:
2234         release_sock(sk);
2235         *err = error; 
2236         return NULL;
2237 }
2238
2239 /*
2240  *      Socket option code for TCP. 
2241  */
2242   
2243 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, 
2244                    int optlen)
2245 {
2246         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2247         int val;
2248         int err = 0;
2249
2250         if (level != SOL_TCP)
2251                 return tp->af_specific->setsockopt(sk, level, optname, 
2252                                                    optval, optlen);
2253
2254         if(optlen<sizeof(int))
2255                 return -EINVAL;
2256
2257         if (get_user(val, (int *)optval))
2258                 return -EFAULT;
2259
2260         lock_sock(sk);
2261
2262         switch(optname) {
2263         case TCP_MAXSEG:
2264                 /* values greater than interface MTU won't take effect.  however at
2265                  * the point when this call is done we typically don't yet know
2266                  * which interface is going to be used
2267                  */
2268                 if(val < 8 || val > MAX_TCP_WINDOW) {
2269                         err = -EINVAL;
2270                         break;
2271                 }
2272                 tp->user_mss = val;
2273                 break;
2274
2275         case TCP_NODELAY:
2276                 /* You cannot try to use this and TCP_CORK in
2277                  * tandem, so let the user know.
2278                  */
2279                 if (tp->nonagle == 2) {
2280                         err = -EINVAL;
2281                         break;
2282                 }
2283                 tp->nonagle = (val == 0) ? 0 : 1;
2284                 if (val)
2285                         tcp_push_pending_frames(sk, tp);
2286                 break;
2287
2288         case TCP_CORK:
2289                 /* When set indicates to always queue non-full frames.
2290                  * Later the user clears this option and we transmit
2291                  * any pending partial frames in the queue.  This is
2292                  * meant to be used alongside sendfile() to get properly
2293                  * filled frames when the user (for example) must write
2294                  * out headers with a write() call first and then use
2295                  * sendfile to send out the data parts.
2296                  *
2297                  * You cannot try to use TCP_NODELAY and this mechanism
2298                  * at the same time, so let the user know.
2299                  */
2300                 if (tp->nonagle == 1) {
2301                         err = -EINVAL;
2302                         break;
2303                 }
2304                 if (val != 0) {
2305                         tp->nonagle = 2;
2306                 } else {
2307                         tp->nonagle = 0;
2308
2309                         tcp_push_pending_frames(sk, tp);
2310                 }
2311                 break;
2312                 
2313         case TCP_KEEPIDLE:
2314                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2315                         err = -EINVAL;
2316                 else {
2317                         tp->keepalive_time = val * HZ;
2318                         if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2319                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2320                                 if (tp->keepalive_time > elapsed)
2321                                         elapsed = tp->keepalive_time - elapsed;
2322                                 else
2323                                         elapsed = 0;
2324                                 tcp_reset_keepalive_timer(sk, elapsed);
2325                         }
2326                 }
2327                 break;
2328         case TCP_KEEPINTVL:
2329                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2330                         err = -EINVAL;
2331                 else
2332                         tp->keepalive_intvl = val * HZ;
2333                 break;
2334         case TCP_KEEPCNT:
2335                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2336                         err = -EINVAL;
2337                 else
2338                         tp->keepalive_probes = val;
2339                 break;
2340         case TCP_SYNCNT:
2341                 if (val < 1 || val > MAX_TCP_SYNCNT)
2342                         err = -EINVAL;
2343                 else
2344                         tp->syn_retries = val;
2345                 break;
2346
2347         case TCP_LINGER2:
2348                 if (val < 0)
2349                         tp->linger2 = -1;
2350                 else if (val > sysctl_tcp_fin_timeout/HZ)
2351                         tp->linger2 = 0;
2352                 else
2353                         tp->linger2 = val*HZ;
2354                 break;
2355
2356         case TCP_DEFER_ACCEPT:
2357                 tp->defer_accept = 0;
2358                 if (val > 0) {
2359                         /* Translate value in seconds to number of retransmits */
2360                         while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2361                                 tp->defer_accept++;
2362                         tp->defer_accept++;
2363                 }
2364                 break;
2365
2366         case TCP_WINDOW_CLAMP:
2367                 if (val==0) {
2368                         if (sk->state != TCP_CLOSE) {
2369                                 err = -EINVAL;
2370                                 break;
2371                         }
2372                         tp->window_clamp = 0;
2373                 } else {
2374                         tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2375                                 SOCK_MIN_RCVBUF/2 : val;
2376                 }
2377                 break;
2378
2379         case TCP_QUICKACK:
2380                 if (!val) {
2381                         tp->ack.pingpong = 1;
2382                 } else {
2383                         tp->ack.pingpong = 0;
2384                         if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2385                             tcp_ack_scheduled(tp)) {
2386                                 tp->ack.pending |= TCP_ACK_PUSHED;
2387                                 cleanup_rbuf(sk, 1);
2388                                 if (!(val & 1))
2389                                         tp->ack.pingpong = 1;
2390                         }
2391                 }
2392                 break;
2393
2394         default:
2395                 err = -ENOPROTOOPT;
2396                 break;
2397         };
2398         release_sock(sk);
2399         return err;
2400 }
2401
2402 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2403                    int *optlen)
2404 {
2405         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2406         int val, len;
2407
2408         if(level != SOL_TCP)
2409                 return tp->af_specific->getsockopt(sk, level, optname,
2410                                                    optval, optlen);
2411
2412         if(get_user(len,optlen))
2413                 return -EFAULT;
2414
2415         len = min_t(unsigned int, len, sizeof(int));
2416         
2417         if(len < 0)
2418                 return -EINVAL;
2419
2420         switch(optname) {
2421         case TCP_MAXSEG:
2422                 val = tp->mss_cache;
2423                 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2424                         val = tp->user_mss;
2425                 break;
2426         case TCP_NODELAY:
2427                 val = (tp->nonagle == 1);
2428                 break;
2429         case TCP_CORK:
2430                 val = (tp->nonagle == 2);
2431                 break;
2432         case TCP_KEEPIDLE:
2433                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2434                 break;
2435         case TCP_KEEPINTVL:
2436                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2437                 break;
2438         case TCP_KEEPCNT:
2439                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2440                 break;
2441         case TCP_SYNCNT:
2442                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2443                 break;
2444         case TCP_LINGER2:
2445                 val = tp->linger2;
2446                 if (val >= 0)
2447                         val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2448                 break;
2449         case TCP_DEFER_ACCEPT:
2450                 val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2451                 break;
2452         case TCP_WINDOW_CLAMP:
2453                 val = tp->window_clamp;
2454                 break;
2455         case TCP_INFO:
2456         {
2457                 struct tcp_info info;
2458                 u32 now = tcp_time_stamp;
2459
2460                 if(get_user(len,optlen))
2461                         return -EFAULT;
2462                 info.tcpi_state = sk->state;
2463                 info.tcpi_ca_state = tp->ca_state;
2464                 info.tcpi_retransmits = tp->retransmits;
2465                 info.tcpi_probes = tp->probes_out;
2466                 info.tcpi_backoff = tp->backoff;
2467                 info.tcpi_options = 0;
2468                 if (tp->tstamp_ok)
2469                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2470                 if (tp->sack_ok)
2471                         info.tcpi_options |= TCPI_OPT_SACK;
2472                 if (tp->wscale_ok) {
2473                         info.tcpi_options |= TCPI_OPT_WSCALE;
2474                         info.tcpi_snd_wscale = tp->snd_wscale;
2475                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2476                 } else {
2477                         info.tcpi_snd_wscale = 0;
2478                         info.tcpi_rcv_wscale = 0;
2479                 }
2480                 if (tp->ecn_flags&TCP_ECN_OK)
2481                         info.tcpi_options |= TCPI_OPT_ECN;
2482
2483                 info.tcpi_rto = (1000000*tp->rto)/HZ;
2484                 info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2485                 info.tcpi_snd_mss = tp->mss_cache;
2486                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2487
2488                 info.tcpi_unacked = tp->packets_out;
2489                 info.tcpi_sacked = tp->sacked_out;
2490                 info.tcpi_lost = tp->lost_out;
2491                 info.tcpi_retrans = tp->retrans_out;
2492                 info.tcpi_fackets = tp->fackets_out;
2493
2494                 info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2495                 info.tcpi_last_ack_sent = 0;
2496                 info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2497                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2498
2499                 info.tcpi_pmtu = tp->pmtu_cookie;
2500                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2501                 info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2502                 info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2503                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2504                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2505                 info.tcpi_advmss = tp->advmss;
2506                 info.tcpi_reordering = tp->reordering;
2507
2508                 len = min_t(unsigned int, len, sizeof(info));
2509                 if(put_user(len, optlen))
2510                         return -EFAULT;
2511                 if(copy_to_user(optval, &info,len))
2512                         return -EFAULT;
2513                 return 0;
2514         }
2515         case TCP_QUICKACK:
2516                 val = !tp->ack.pingpong;
2517                 break;
2518         default:
2519                 return -ENOPROTOOPT;
2520         };
2521
2522         if(put_user(len, optlen))
2523                 return -EFAULT;
2524         if(copy_to_user(optval, &val,len))
2525                 return -EFAULT;
2526         return 0;
2527 }
2528
2529
2530 extern void __skb_cb_too_small_for_tcp(int, int);
2531 extern void tcpdiag_init(void);
2532
2533 void __init tcp_init(void)
2534 {
2535         struct sk_buff *skb = NULL;
2536         unsigned long goal;
2537         int order, i;
2538
2539         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2540                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2541                                            sizeof(skb->cb));
2542
2543         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2544                                                    sizeof(struct open_request),
2545                                                0, SLAB_HWCACHE_ALIGN,
2546                                                NULL, NULL);
2547         if(!tcp_openreq_cachep)
2548                 panic("tcp_init: Cannot alloc open_request cache.");
2549
2550         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2551                                               sizeof(struct tcp_bind_bucket),
2552                                               0, SLAB_HWCACHE_ALIGN,
2553                                               NULL, NULL);
2554         if(!tcp_bucket_cachep)
2555                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2556
2557         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2558                                                 sizeof(struct tcp_tw_bucket),
2559                                                 0, SLAB_HWCACHE_ALIGN,
2560                                                 NULL, NULL);
2561         if(!tcp_timewait_cachep)
2562                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2563
2564         /* Size and allocate the main established and bind bucket
2565          * hash tables.
2566          *
2567          * The methodology is similar to that of the buffer cache.
2568          */
2569         if (num_physpages >= (128 * 1024))
2570                 goal = num_physpages >> (21 - PAGE_SHIFT);
2571         else
2572                 goal = num_physpages >> (23 - PAGE_SHIFT);
2573
2574         for(order = 0; (1UL << order) < goal; order++)
2575                 ;
2576         do {
2577                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2578                         sizeof(struct tcp_ehash_bucket);
2579                 tcp_ehash_size >>= 1;
2580                 while (tcp_ehash_size & (tcp_ehash_size-1))
2581                         tcp_ehash_size--;
2582                 tcp_ehash = (struct tcp_ehash_bucket *)
2583                         __get_free_pages(GFP_ATOMIC, order);
2584         } while (tcp_ehash == NULL && --order > 0);
2585
2586         if (!tcp_ehash)
2587                 panic("Failed to allocate TCP established hash table\n");
2588         for (i = 0; i < (tcp_ehash_size<<1); i++) {
2589                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2590                 tcp_ehash[i].chain = NULL;
2591         }
2592
2593         do {
2594                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2595                         sizeof(struct tcp_bind_hashbucket);
2596                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2597                         continue;
2598                 tcp_bhash = (struct tcp_bind_hashbucket *)
2599                         __get_free_pages(GFP_ATOMIC, order);
2600         } while (tcp_bhash == NULL && --order >= 0);
2601
2602         if (!tcp_bhash)
2603                 panic("Failed to allocate TCP bind hash table\n");
2604         for (i = 0; i < tcp_bhash_size; i++) {
2605                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2606                 tcp_bhash[i].chain = NULL;
2607         }
2608
2609         /* Try to be a bit smarter and adjust defaults depending
2610          * on available memory.
2611          */
2612         if (order > 4) {
2613                 sysctl_local_port_range[0] = 32768;
2614                 sysctl_local_port_range[1] = 61000;
2615                 sysctl_tcp_max_tw_buckets = 180000;
2616                 sysctl_tcp_max_orphans = 4096<<(order-4);
2617                 sysctl_max_syn_backlog = 1024;
2618         } else if (order < 3) {
2619                 sysctl_local_port_range[0] = 1024*(3-order);
2620                 sysctl_tcp_max_tw_buckets >>= (3-order);
2621                 sysctl_tcp_max_orphans >>= (3-order);
2622                 sysctl_max_syn_backlog = 128;
2623         }
2624         tcp_port_rover = sysctl_local_port_range[0] - 1;
2625
2626         sysctl_tcp_mem[0] = 768<<order;
2627         sysctl_tcp_mem[1] = 1024<<order;
2628         sysctl_tcp_mem[2] = 1536<<order;
2629         if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2630                 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2631         if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2632                 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2633
2634         if (order < 3) {
2635                 sysctl_tcp_wmem[2] = 64*1024;
2636                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2637                 sysctl_tcp_rmem[1] = 43689;
2638                 sysctl_tcp_rmem[2] = 2*43689;
2639         }
2640
2641         printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2642                tcp_ehash_size<<1, tcp_bhash_size);
2643
2644         tcpdiag_init();
2645 }