linux-2.4.git / net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.215 2001/10/31 08:17:58 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFCs. For other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *                                      
208  *              This program is free software; you can redistribute it and/or
209  *              modify it under the terms of the GNU General Public License
210  *              as published by the Free Software Foundation; either version
211  *              2 of the License, or(at your option) any later version.
212  *
213  * Description of States:
214  *
215  *      TCP_SYN_SENT            sent a connection request, waiting for ack
216  *
217  *      TCP_SYN_RECV            received a connection request, sent ack,
218  *                              waiting for final ack in three-way handshake.
219  *
220  *      TCP_ESTABLISHED         connection established
221  *
222  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
223  *                              transmission of remaining buffered data
224  *
225  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
226  *                              to shutdown
227  *
228  *      TCP_CLOSING             both sides have shutdown but we still have
229  *                              data we have to finish sending
230  *
231  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
232  *                              closed, can only be entered from FIN_WAIT2
233  *                              or CLOSING.  Required because the other end
234  *                              may not have gotten our last ACK causing it
235  *                              to retransmit the data packet (which we ignore)
236  *
237  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
238  *                              us to finish writing our data and to shutdown
239  *                              (we have to close() to move on to LAST_ACK)
240  *
241  *      TCP_LAST_ACK            our side has shutdown after remote has
242  *                              shutdown.  There may still be data in our
243  *                              buffer that we have to finish sending
244  *
245  *      TCP_CLOSE               socket is finished
246  */
247
248 #include <linux/config.h>
249 #include <linux/types.h>
250 #include <linux/fcntl.h>
251 #include <linux/poll.h>
252 #include <linux/init.h>
253 #include <linux/smp_lock.h>
254 #include <linux/fs.h>
255 #include <linux/random.h>
256
257 #include <net/icmp.h>
258 #include <net/tcp.h>
259
260 #include <asm/uaccess.h>
261 #include <asm/ioctls.h>
262
263 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
264
265 struct tcp_mib  tcp_statistics[NR_CPUS*2];
266
267 kmem_cache_t *tcp_openreq_cachep;
268 kmem_cache_t *tcp_bucket_cachep;
269 kmem_cache_t *tcp_timewait_cachep;
270
271 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
272
273 int sysctl_tcp_default_win_scale = 0;
274
275 int sysctl_tcp_mem[3];
276 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
277 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
278
279 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
280 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
281
282 /* Pressure flag: try to collapse.
283  * Technical note: it is used by multiple contexts non-atomically.
284  * All of tcp_mem_schedule() is of this nature: accounting
285  * is strict, actions are advisory and have some latency. */
286 int tcp_memory_pressure;
287
288 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
289
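/*
 *      Charge "size" bytes (rounded up to whole TCP_MEM_QUANTUM units) to the
 *      socket's forward_alloc and to the global tcp_memory_allocated pool.
 *      "kind" selects the per-socket minimum that is always honoured:
 *      non-zero means the receive side (sysctl_tcp_rmem[0]), zero the send
 *      side (sysctl_tcp_wmem[0]).  Returns 1 if the charge may stand, 0 after
 *      undoing the accounting when memory is too tight.
 */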
290 int tcp_mem_schedule(struct sock *sk, int size, int kind)
291 {
292         int amt = TCP_PAGES(size);
293
294         sk->forward_alloc += amt*TCP_MEM_QUANTUM;
295         atomic_add(amt, &tcp_memory_allocated);
296
297         /* Under limit. */
298         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
299                 if (tcp_memory_pressure)
300                         tcp_memory_pressure = 0;
301                 return 1;
302         }
303
304         /* Over hard limit. */
305         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
306                 tcp_enter_memory_pressure();
307                 goto suppress_allocation;
308         }
309
310         /* Under pressure. */
311         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
312                 tcp_enter_memory_pressure();
313
314         if (kind) {
315                 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
316                         return 1;
317         } else {
318                 if (sk->wmem_queued < sysctl_tcp_wmem[0])
319                         return 1;
320         }
321
322         if (!tcp_memory_pressure ||
323             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
324             * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
325                         sk->forward_alloc))
326                 return 1;
327
328 suppress_allocation:
329
330         if (kind == 0) {
331                 tcp_moderate_sndbuf(sk);
332
333                 /* Fail only if socket is _under_ its sndbuf.
334                  * In this case we cannot block, so we have to fail.
335                  */
336                 if (sk->wmem_queued+size >= sk->sndbuf)
337                         return 1;
338         }
339
340         /* Alas. Undo changes. */
341         sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
342         atomic_sub(amt, &tcp_memory_allocated);
343         return 0;
344 }
345
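/*
 *      Give back to the global pool any whole quanta held in sk->forward_alloc,
 *      keeping only the sub-quantum remainder, and drop the memory-pressure
 *      flag once global usage falls below sysctl_tcp_mem[0] again.
 */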
346 void __tcp_mem_reclaim(struct sock *sk)
347 {
348         if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
349                 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
350                 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
351                 if (tcp_memory_pressure &&
352                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
353                         tcp_memory_pressure = 0;
354         }
355 }
356
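/*
 *      Destructor for skbs charged to the receive queue: uncharge
 *      sk->rmem_alloc and return the space to the socket's forward allocation.
 */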
357 void tcp_rfree(struct sk_buff *skb)
358 {
359         struct sock *sk = skb->sk;
360
361         atomic_sub(skb->truesize, &sk->rmem_alloc);
362         sk->forward_alloc += skb->truesize;
363 }
364
365 /*
366  * LISTEN is a special case for poll..
367  */
368 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
369 {
370         return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
371 }
372
373 /*
374  *      Wait for a TCP event.
375  *
376  *      Note that we don't need to lock the socket, as the upper poll layers
377  *      take care of normal races (between the test and the event) and we don't
378  *      go look at any of the socket buffers directly.
379  */
380 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
381 {
382         unsigned int mask;
383         struct sock *sk = sock->sk;
384         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
385
386         poll_wait(file, sk->sleep, wait);
387         if (sk->state == TCP_LISTEN)
388                 return tcp_listen_poll(sk, wait);
389
390         /* Socket is not locked. We are protected from async events
391            by the poll logic, and correct handling of state changes
392            made by other threads is impossible in any case.
393          */
394
395         mask = 0;
396         if (sk->err)
397                 mask = POLLERR;
398
399         /*
400          * POLLHUP is certainly not done right. But poll() doesn't
401          * have a notion of HUP in just one direction, and for a
402          * socket the read side is more interesting.
403          *
404          * Some poll() documentation says that POLLHUP is incompatible
405          * with the POLLOUT/POLLWR flags, so somebody should check this
406          * all. But careful, it tends to be safer to return too many
407          * bits than too few, and you can easily break real applications
408          * if you don't tell them that something has hung up!
409          *
410          * Check-me.
411          *
412          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
413          * our fs/select.c). It means that after we received EOF,
414          * poll always returns immediately, making poll() on write() in state
415          * CLOSE_WAIT impossible. One solution is evident --- to set POLLHUP
416          * if and only if shutdown has been made in both directions.
417          * Actually, it is interesting to look at how Solaris and DUX
418          * solve this dilemma. I would prefer it if POLLHUP were maskable,
419          * then we could set it on SND_SHUTDOWN. BTW the examples given
420          * in Stevens' books assume exactly this behaviour, which explains
421          * why POLLHUP is incompatible with POLLOUT.    --ANK
422          *
423          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
424          * blocking on fresh not-connected or disconnected socket. --ANK
425          */
426         if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
427                 mask |= POLLHUP;
428         if (sk->shutdown & RCV_SHUTDOWN)
429                 mask |= POLLIN | POLLRDNORM;
430
431         /* Connected? */
432         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
433                 /* Potential race condition. If the read of tp below is
434                  * reordered above the read of sk->state, we can be illegally
435                  * awakened in SYN_* states. */
436                 if ((tp->rcv_nxt != tp->copied_seq) &&
437                     (tp->urg_seq != tp->copied_seq ||
438                      tp->rcv_nxt != tp->copied_seq+1 ||
439                      sk->urginline || !tp->urg_data))
440                         mask |= POLLIN | POLLRDNORM;
441
442                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
443                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
444                                 mask |= POLLOUT | POLLWRNORM;
445                         } else {  /* send SIGIO later */
446                                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
447                                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
448
449                                 /* Race breaker. If space is freed after
450                                  * wspace test but before the flags are set,
451                                  * IO signal will be lost.
452                                  */
453                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
454                                         mask |= POLLOUT | POLLWRNORM;
455                         }
456                 }
457
458                 if (tp->urg_data & TCP_URG_VALID)
459                         mask |= POLLPRI;
460         }
461         return mask;
462 }
463
464 /*
465  *      TCP socket write_space callback.
466  */
467 void tcp_write_space(struct sock *sk)
468 {
469         struct socket *sock = sk->socket;
470
471         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
472                 clear_bit(SOCK_NOSPACE, &sock->flags);
473
474                 if (sk->sleep && waitqueue_active(sk->sleep))
475                         wake_up_interruptible(sk->sleep);
476
477                 if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
478                         sock_wake_async(sock, 2, POLL_OUT);
479         }
480 }
481
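/*
 *      ioctl() handler for TCP sockets:
 *
 *      SIOCINQ (FIONREAD)  - bytes of data waiting to be read (a queued FIN
 *                            is not counted)
 *      SIOCATMARK          - non-zero if the next byte to read is the urgent mark
 *      SIOCOUTQ            - bytes queued or sent but not yet acknowledged
 *
 *      Illustrative user-space usage (not part of this file):
 *
 *              int queued;
 *              if (ioctl(fd, SIOCINQ, &queued) == 0)
 *                      printf("%d bytes readable\n", queued);
 */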
482 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
483 {
484         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
485         int answ;
486
487         switch(cmd) {
488         case SIOCINQ:
489                 if (sk->state == TCP_LISTEN)
490                         return(-EINVAL);
491
492                 lock_sock(sk);
493                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
494                         answ = 0;
495                 else if (sk->urginline || !tp->urg_data ||
496                          before(tp->urg_seq,tp->copied_seq) ||
497                          !before(tp->urg_seq,tp->rcv_nxt)) {
498                         answ = tp->rcv_nxt - tp->copied_seq;
499
500                         /* Subtract 1, if FIN is in queue. */
501                         if (answ && !skb_queue_empty(&sk->receive_queue))
502                                 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
503                 } else
504                         answ = tp->urg_seq - tp->copied_seq;
505                 release_sock(sk);
506                 break;
507         case SIOCATMARK:
508                 {
509                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
510                         break;
511                 }
512         case SIOCOUTQ:
513                 if (sk->state == TCP_LISTEN)
514                         return(-EINVAL);
515
516                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
517                         answ = 0;
518                 else
519                         answ = tp->write_seq - tp->snd_una;
520                 break;
521         default:
522                 return(-ENOIOCTLCMD);
523         };
524
525         return put_user(answ, (int *)arg);
526 }
527
528
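/*
 *      Move a socket into listening state: reset the accept queue, allocate
 *      the SYN queue (tcp_listen_opt) with a hash sized to the smallest power
 *      of two >= sysctl_max_syn_backlog (minimum 64) and a random hash seed,
 *      then validate the local port with get_port() and hash the socket.
 *      If the port cannot be taken, everything is rolled back and
 *      -EADDRINUSE is returned.
 */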
529 int tcp_listen_start(struct sock *sk)
530 {
531         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
532         struct tcp_listen_opt *lopt;
533
534         sk->max_ack_backlog = 0;
535         sk->ack_backlog = 0;
536         tp->accept_queue = tp->accept_queue_tail = NULL;
537         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
538         tcp_delack_init(tp);
539
540         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
541         if (!lopt)
542                 return -ENOMEM;
543
544         memset(lopt, 0, sizeof(struct tcp_listen_opt));
545         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
546                 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
547                         break;
548         get_random_bytes(&lopt->hash_rnd, 4);
549
550         write_lock_bh(&tp->syn_wait_lock);
551         tp->listen_opt = lopt;
552         write_unlock_bh(&tp->syn_wait_lock);
553
554         /* There is a race window here: we announce ourselves listening,
555          * but this transition is still not validated by get_port().
556          * It is OK, because this socket enters the hash table only
557          * after validation is complete.
558          */
559         sk->state = TCP_LISTEN;
560         if (sk->prot->get_port(sk, sk->num) == 0) {
561                 sk->sport = htons(sk->num);
562
563                 sk_dst_reset(sk);
564                 sk->prot->hash(sk);
565
566                 return 0;
567         }
568
569         sk->state = TCP_CLOSE;
570         write_lock_bh(&tp->syn_wait_lock);
571         tp->listen_opt = NULL;
572         write_unlock_bh(&tp->syn_wait_lock);
573         kfree(lopt);
574         return -EADDRINUSE;
575 }
576
577 /*
578  *      This routine closes sockets which have been at least partially
579  *      opened, but not yet accepted.
580  */
581
582 static void tcp_listen_stop (struct sock *sk)
583 {
584         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
585         struct tcp_listen_opt *lopt = tp->listen_opt;
586         struct open_request *acc_req = tp->accept_queue;
587         struct open_request *req;
588         int i;
589
590         tcp_delete_keepalive_timer(sk);
591
592         /* make all the listen_opt local to us */
593         write_lock_bh(&tp->syn_wait_lock);
594         tp->listen_opt =NULL;
595         write_unlock_bh(&tp->syn_wait_lock);
596         tp->accept_queue = tp->accept_queue_tail = NULL;
597
598         if (lopt->qlen) {
599                 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
600                         while ((req = lopt->syn_table[i]) != NULL) {
601                                 lopt->syn_table[i] = req->dl_next;
602                                 lopt->qlen--;
603                                 tcp_openreq_free(req);
604
605                 /* Following specs, it would be better either to send FIN
606                  * (and enter FIN-WAIT-1, it is normal close)
607                  * or to send an active reset (abort).
608                  * Certainly, that is pretty dangerous during a synflood, but it is
609                  * a bad justification for our negligence 8)
610                  * To be honest, we are not able to make either
611                  * of the variants now.                 --ANK
612                  */
613                         }
614                 }
615         }
616         BUG_TRAP(lopt->qlen == 0);
617
618         kfree(lopt);
619
620         while ((req=acc_req) != NULL) {
621                 struct sock *child = req->sk;
622
623                 acc_req = req->dl_next;
624
625                 local_bh_disable();
626                 bh_lock_sock(child);
627                 BUG_TRAP(child->lock.users==0);
628                 sock_hold(child);
629
630                 tcp_disconnect(child, O_NONBLOCK);
631
632                 sock_orphan(child);
633
634                 atomic_inc(&tcp_orphan_count);
635
636                 tcp_destroy_sock(child);
637
638                 bh_unlock_sock(child);
639                 local_bh_enable();
640                 sock_put(child);
641
642                 tcp_acceptq_removed(sk);
643                 tcp_openreq_fastfree(req);
644         }
645         BUG_TRAP(sk->ack_backlog == 0);
646 }
647
648 /*
649  *      Wait for a socket to get into the connected state
650  *
651  *      Note: Must be called with the socket locked.
652  */
653 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
654 {
655         struct task_struct *tsk = current;
656         DECLARE_WAITQUEUE(wait, tsk);
657
658         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
659                 if(sk->err)
660                         return sock_error(sk);
661                 if((1 << sk->state) &
662                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
663                         return -EPIPE;
664                 if(!*timeo_p)
665                         return -EAGAIN;
666                 if(signal_pending(tsk))
667                         return sock_intr_errno(*timeo_p);
668
669                 __set_task_state(tsk, TASK_INTERRUPTIBLE);
670                 add_wait_queue(sk->sleep, &wait);
671                 sk->tp_pinfo.af_tcp.write_pending++;
672
673                 release_sock(sk);
674                 *timeo_p = schedule_timeout(*timeo_p);
675                 lock_sock(sk);
676
677                 __set_task_state(tsk, TASK_RUNNING);
678                 remove_wait_queue(sk->sleep, &wait);
679                 sk->tp_pinfo.af_tcp.write_pending--;
680         }
681         return 0;
682 }
683
684 static inline int tcp_memory_free(struct sock *sk)
685 {
686         return sk->wmem_queued < sk->sndbuf;
687 }
688
689 /*
690  *      Wait for more memory for a socket
691  */
692 static int wait_for_tcp_memory(struct sock * sk, long *timeo)
693 {
694         int err = 0;
695         long vm_wait = 0;
696         long current_timeo = *timeo;
697         DECLARE_WAITQUEUE(wait, current);
698
699         if (tcp_memory_free(sk))
700                 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
701
702         add_wait_queue(sk->sleep, &wait);
703         for (;;) {
704                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
705
706                 set_current_state(TASK_INTERRUPTIBLE);
707
708                 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
709                         goto do_error;
710                 if (!*timeo)
711                         goto do_nonblock;
712                 if (signal_pending(current))
713                         goto do_interrupted;
714                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
715                 if (tcp_memory_free(sk) && !vm_wait)
716                         break;
717
718                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
719                 sk->tp_pinfo.af_tcp.write_pending++;
720                 release_sock(sk);
721                 if (!tcp_memory_free(sk) || vm_wait)
722                         current_timeo = schedule_timeout(current_timeo);
723                 lock_sock(sk);
724                 sk->tp_pinfo.af_tcp.write_pending--;
725
726                 if (vm_wait) {
727                         vm_wait -= current_timeo;
728                         current_timeo = *timeo;
729                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
730                             (current_timeo -= vm_wait) < 0)
731                                 current_timeo = 0;
732                         vm_wait = 0;
733                 }
734                 *timeo = current_timeo;
735         }
736 out:
737         current->state = TASK_RUNNING;
738         remove_wait_queue(sk->sleep, &wait);
739         return err;
740
741 do_error:
742         err = -EPIPE;
743         goto out;
744 do_nonblock:
745         err = -EAGAIN;
746         goto out;
747 do_interrupted:
748         err = sock_intr_errno(*timeo);
749         goto out;
750 }
751
752 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
753
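/*
 *      Page-fragment helpers for the sendpage/sendmsg paths: can_coalesce()
 *      reports whether data at (page, off) directly continues the last page
 *      fragment of skb, so that fragment can simply be grown instead of
 *      consuming a new slot; fill_page_desc() installs a new fragment
 *      descriptor in slot i.
 */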
754 static inline int
755 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
756 {
757         if (i) {
758                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
759                 return page == frag->page &&
760                         off == frag->page_offset+frag->size;
761         }
762         return 0;
763 }
764
765 static inline void
766 fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
767 {
768         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
769         frag->page = page;
770         frag->page_offset = off;
771         frag->size = size;
772         skb_shinfo(skb)->nr_frags = i+1;
773 }
774
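/*
 *      Small send-path helpers: tcp_mark_push() sets PSH on an skb and records
 *      the pushed sequence; forced_push() fires once more than half of the
 *      largest window the peer ever advertised has accumulated unpushed;
 *      skb_entail() queues a fresh skb at write_seq and charges it to the
 *      socket; tcp_mark_urg() enters urgent mode for MSG_OOB; tcp_push()
 *      applies the PSH/URG marks and pushes any pending frames.
 */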
775 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
776 {
777         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
778         tp->pushed_seq = tp->write_seq;
779 }
780
781 static inline int forced_push(struct tcp_opt *tp)
782 {
783         return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
784 }
785
786 static inline void
787 skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
788 {
789         skb->csum = 0;
790         TCP_SKB_CB(skb)->seq = tp->write_seq;
791         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
792         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
793         TCP_SKB_CB(skb)->sacked = 0;
794         __skb_queue_tail(&sk->write_queue, skb);
795         tcp_charge_skb(sk, skb);
796         if (tp->send_head == NULL)
797                 tp->send_head = skb;
798 }
799
800 static inline void
801 tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
802 {
803         if (flags & MSG_OOB) {
804                 tp->urg_mode = 1;
805                 tp->snd_up = tp->write_seq;
806                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
807         }
808 }
809
810 static inline void
811 tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
812 {
813         if (tp->send_head) {
814                 struct sk_buff *skb = sk->write_queue.prev;
815                 if (!(flags&MSG_MORE) || forced_push(tp))
816                         tcp_mark_push(tp, skb);
817                 tcp_mark_urg(tp, flags, skb);
818                 __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
819         }
820 }
821
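/*
 *      Map an internal send error to what the caller should see: a pending
 *      socket error replaces a bare -EPIPE, and SIGPIPE is raised unless the
 *      caller passed MSG_NOSIGNAL.
 */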
822 static int tcp_error(struct sock *sk, int flags, int err)
823 {
824         if (err == -EPIPE)
825                 err = sock_error(sk) ? : -EPIPE;
826         if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
827                 send_sig(SIGPIPE, current, 0);
828         return err;
829 }
830
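/*
 *      Zero-copy send path used by tcp_sendpage(): walk the page array,
 *      extending the last fragment of the tail skb when can_coalesce() allows
 *      it or starting new segments otherwise, and leave skb->ip_summed set to
 *      CHECKSUM_HW so the checksum offload verified by tcp_sendpage() does the
 *      work.  Blocks in wait_for_tcp_memory() when the send buffer is full.
 */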
831 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
832 {
833         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
834         int mss_now;
835         int err;
836         ssize_t copied;
837         long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
838
839         /* Wait for a connection to finish. */
840         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
841                 if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
842                         goto out_err;
843
844         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
845
846         mss_now = tcp_current_mss(sk);
847         copied = 0;
848
849         err = -EPIPE;
850         if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
851                 goto do_error;
852
853         while (psize > 0) {
854                 struct sk_buff *skb = sk->write_queue.prev;
855                 int offset, size, copy, i;
856                 struct page *page;
857
858                 page = pages[poffset/PAGE_SIZE];
859                 offset = poffset % PAGE_SIZE;
860                 size = min_t(size_t, psize, PAGE_SIZE-offset);
861
862                 if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
863 new_segment:
864                         if (!tcp_memory_free(sk))
865                                 goto wait_for_sndbuf;
866
867                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
868                         if (skb == NULL)
869                                 goto wait_for_memory;
870
871                         skb_entail(sk, tp, skb);
872                         copy = mss_now;
873                 }
874
875                 if (copy > size)
876                         copy = size;
877
878                 i = skb_shinfo(skb)->nr_frags;
879                 if (can_coalesce(skb, i, page, offset)) {
880                         skb_shinfo(skb)->frags[i-1].size += copy;
881                 } else if (i < MAX_SKB_FRAGS) {
882                         get_page(page);
883                         fill_page_desc(skb, i, page, offset, copy);
884                 } else {
885                         tcp_mark_push(tp, skb);
886                         goto new_segment;
887                 }
888
889                 skb->len += copy;
890                 skb->data_len += copy;
891                 skb->ip_summed = CHECKSUM_HW;
892                 tp->write_seq += copy;
893                 TCP_SKB_CB(skb)->end_seq += copy;
894
895                 if (!copied)
896                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
897
898                 copied += copy;
899                 poffset += copy;
900                 if (!(psize -= copy))
901                         goto out;
902
903                 if (skb->len != mss_now || (flags&MSG_OOB))
904                         continue;
905
906                 if (forced_push(tp)) {
907                         tcp_mark_push(tp, skb);
908                         __tcp_push_pending_frames(sk, tp, mss_now, 1);
909                 } else if (skb == tp->send_head)
910                         tcp_push_one(sk, mss_now);
911                 continue;
912
913 wait_for_sndbuf:
914                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
915 wait_for_memory:
916                 if (copied)
917                         tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
918
919                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
920                         goto do_error;
921
922                 mss_now = tcp_current_mss(sk);
923         }
924
925 out:
926         if (copied)
927                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
928         return copied;
929
930 do_error:
931         if (copied)
932                 goto out;
933 out_err:
934         return tcp_error(sk, flags, err);
935 }
936
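/*
 *      sendpage() entry point, typically reached via sendfile(2) on a TCP
 *      socket.  Falls back to sock_no_sendpage() (an ordinary copying send)
 *      unless the route supports scatter-gather and checksum offload.
 *
 *      Illustrative user-space trigger (not part of this file):
 *
 *              off_t off = 0;
 *              sendfile(sock_fd, file_fd, &off, count);
 */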
937 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
938 {
939         ssize_t res;
940         struct sock *sk = sock->sk;
941
942 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
943
944         if (!(sk->route_caps & NETIF_F_SG) || 
945             !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
946                 return sock_no_sendpage(sock, page, offset, size, flags);
947
948 #undef TCP_ZC_CSUM_FLAGS
949
950         lock_sock(sk);
951         TCP_CHECK_TIMER(sk);
952         res = do_tcp_sendpages(sk, &page, offset, size, flags);
953         TCP_CHECK_TIMER(sk);
954         release_sock(sk);
955         return res;
956 }
957
958 #define TCP_PAGE(sk)    (sk->tp_pinfo.af_tcp.sndmsg_page)
959 #define TCP_OFF(sk)     (sk->tp_pinfo.af_tcp.sndmsg_off)
960
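/*
 *      Copy "copy" bytes of user data into a page fragment attached to skb,
 *      folding the partial checksum into skb->csum when the skb is software
 *      checksummed, and charge the bytes to the socket's write-memory
 *      accounting.  Returns 0 on success or the fault error from the copy.
 */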
961 static inline int
962 tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
963                  struct page *page, int off, int copy)
964 {
965         int err = 0;
966         unsigned int csum;
967
968         csum = csum_and_copy_from_user(from, page_address(page)+off,
969                                        copy, 0, &err);
970         if (!err) {
971                 if (skb->ip_summed == CHECKSUM_NONE)
972                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
973                 skb->len += copy;
974                 skb->data_len += copy;
975                 skb->truesize += copy;
976                 sk->wmem_queued += copy;
977                 sk->forward_alloc -= copy;
978         }
979         return err;
980 }
981
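/*
 *      Copy "copy" bytes of user data into the linear area of skb via
 *      skb_put(), folding the checksum as we go; on a fault the skb is
 *      trimmed back to its old length and -EFAULT is returned.
 */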
982 static inline int
983 skb_add_data(struct sk_buff *skb, char *from, int copy)
984 {
985         int err = 0;
986         unsigned int csum;
987         int off = skb->len;
988
989         csum = csum_and_copy_from_user(from, skb_put(skb, copy),
990                                        copy, 0, &err);
991         if (!err) {
992                 skb->csum = csum_block_add(skb->csum, csum, off);
993                 return 0;
994         }
995
996         __skb_trim(skb, off);
997         return -EFAULT;
998 }
999
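/*
 *      Choose the linear ("head") size for a new skb in tcp_sendmsg().  On a
 *      scatter-gather capable route, if the cached MSS falls in the range that
 *      page fragments can cover, clamp the head to SKB_MAX_HEAD(MAX_TCP_HEADER)
 *      and let the remainder go into page fragments.
 */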
1000 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1001 {
1002         int tmp = tp->mss_cache;
1003
1004         if (sk->route_caps&NETIF_F_SG) {
1005                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1006
1007                 if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1008                         tmp = pgbreak;
1009         }
1010         return tmp;
1011 }
1012
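/*
 *      Copy data from the user's iovec into the write queue: fill any tailroom
 *      in the last skb first, then append page fragments using the per-socket
 *      cached page (TCP_PAGE/TCP_OFF).  Segments are pushed according to
 *      Nagle and forced_push(), and the call blocks in wait_for_tcp_connect()
 *      or wait_for_tcp_memory() as required.
 */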
1013 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1014 {
1015         struct iovec *iov;
1016         struct tcp_opt *tp;
1017         struct sk_buff *skb;
1018         int iovlen, flags;
1019         int mss_now;
1020         int err, copied;
1021         long timeo;
1022
1023         tp = &(sk->tp_pinfo.af_tcp);
1024
1025         lock_sock(sk);
1026         TCP_CHECK_TIMER(sk);
1027
1028         flags = msg->msg_flags;
1029         timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1030
1031         /* Wait for a connection to finish. */
1032         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1033                 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1034                         goto out_err;
1035
1036         /* This should be in poll */
1037         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1038
1039         mss_now = tcp_current_mss(sk);
1040
1041         /* Ok commence sending. */
1042         iovlen = msg->msg_iovlen;
1043         iov = msg->msg_iov;
1044         copied = 0;
1045
1046         err = -EPIPE;
1047         if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1048                 goto do_error;
1049
1050         while (--iovlen >= 0) {
1051                 int seglen=iov->iov_len;
1052                 unsigned char * from=iov->iov_base;
1053
1054                 iov++;
1055
1056                 while (seglen > 0) {
1057                         int copy;
1058                         
1059                         skb = sk->write_queue.prev;
1060
1061                         if (tp->send_head == NULL ||
1062                             (copy = mss_now - skb->len) <= 0) {
1063
1064 new_segment:
1065                                 /* Allocate a new segment. If the interface is SG,
1066                                  * allocate an skb that fits into a single page.
1067                                  */
1068                                 if (!tcp_memory_free(sk))
1069                                         goto wait_for_sndbuf;
1070
1071                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1072                                 if (skb == NULL)
1073                                         goto wait_for_memory;
1074
1075                                 skb_entail(sk, tp, skb);
1076                                 copy = mss_now;
1077                         }
1078
1079                         /* Try to append data to the end of skb. */
1080                         if (copy > seglen)
1081                                 copy = seglen;
1082
1083                         /* Where to copy to? */
1084                         if (skb_tailroom(skb) > 0) {
1085                                 /* We have some space in skb head. Superb! */
1086                                 if (copy > skb_tailroom(skb))
1087                                         copy = skb_tailroom(skb);
1088                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1089                                         goto do_fault;
1090                         } else {
1091                                 int merge = 0;
1092                                 int i = skb_shinfo(skb)->nr_frags;
1093                                 struct page *page = TCP_PAGE(sk);
1094                                 int off = TCP_OFF(sk);
1095
1096                                 if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1097                                         /* We can extend the last page fragment. */
1098                                         merge = 1;
1099                                 } else if (i == MAX_SKB_FRAGS ||
1100                                            (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1101                                         /* Need to add new fragment and cannot
1102                                          * do this because interface is non-SG,
1103                                          * or because all the page slots are busy.
1104                                          */
1105                                         tcp_mark_push(tp, skb);
1106                                         goto new_segment;
1107                                 } else if (page) {
1108                                         /* If page is cached, align
1109                                          * offset to L1 cache boundary
1110                                          */
1111                                         off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1112                                         if (off == PAGE_SIZE) {
1113                                                 put_page(page);
1114                                                 TCP_PAGE(sk) = page = NULL;
1115                                         }
1116                                 }
1117
1118                                 if (!page) {
1119                                         /* Allocate new cache page. */
1120                                         if (!(page=tcp_alloc_page(sk)))
1121                                                 goto wait_for_memory;
1122                                         off = 0;
1123                                 }
1124
1125                                 if (copy > PAGE_SIZE-off)
1126                                         copy = PAGE_SIZE-off;
1127
1128                                 /* Time to copy data. We are close to the end! */
1129                                 err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1130                                 if (err) {
1131                                         /* If this page was new, give it to the
1132                                          * socket so it does not get leaked.
1133                                          */
1134                                         if (TCP_PAGE(sk) == NULL) {
1135                                                 TCP_PAGE(sk) = page;
1136                                                 TCP_OFF(sk) = 0;
1137                                         }
1138                                         goto do_error;
1139                                 }
1140
1141                                 /* Update the skb. */
1142                                 if (merge) {
1143                                         skb_shinfo(skb)->frags[i-1].size += copy;
1144                                 } else {
1145                                         fill_page_desc(skb, i, page, off, copy);
1146                                         if (TCP_PAGE(sk)) {
1147                                                 get_page(page);
1148                                         } else if (off + copy < PAGE_SIZE) {
1149                                                 get_page(page);
1150                                                 TCP_PAGE(sk) = page;
1151                                         }
1152                                 }
1153
1154                                 TCP_OFF(sk) = off+copy;
1155                         }
1156
1157                         if (!copied)
1158                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1159
1160                         tp->write_seq += copy;
1161                         TCP_SKB_CB(skb)->end_seq += copy;
1162
1163                         from += copy;
1164                         copied += copy;
1165                         if ((seglen -= copy) == 0 && iovlen == 0)
1166                                 goto out;
1167
1168                         if (skb->len != mss_now || (flags&MSG_OOB))
1169                                 continue;
1170
1171                         if (forced_push(tp)) {
1172                                 tcp_mark_push(tp, skb);
1173                                 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1174                         } else if (skb == tp->send_head)
1175                                 tcp_push_one(sk, mss_now);
1176                         continue;
1177
1178 wait_for_sndbuf:
1179                         set_bit(SOCK_NOSPACE, &sk->socket->flags);
1180 wait_for_memory:
1181                         if (copied)
1182                                 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1183
1184                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1185                                 goto do_error;
1186
1187                         mss_now = tcp_current_mss(sk);
1188                 }
1189         }
1190
1191 out:
1192         if (copied)
1193                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1194         TCP_CHECK_TIMER(sk);
1195         release_sock(sk);
1196         return copied;
1197
1198 do_fault:
1199         if (skb->len == 0) {
1200                 if (tp->send_head == skb)
1201                         tp->send_head = NULL;
1202                 __skb_unlink(skb, skb->list);
1203                 tcp_free_skb(sk, skb);
1204         }
1205
1206 do_error:
1207         if (copied)
1208                 goto out;
1209 out_err:
1210         err = tcp_error(sk, flags, err);
1211         TCP_CHECK_TIMER(sk);
1212         release_sock(sk);
1213         return err;
1214 }
1215
1216 /*
1217  *      Handle reading urgent data. BSD has very simple semantics for
1218  *      this, no blocking and very strange errors 8)
1219  */
1220
1221 static int tcp_recv_urg(struct sock * sk, long timeo,
1222                         struct msghdr *msg, int len, int flags, 
1223                         int *addr_len)
1224 {
1225         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1226
1227         /* No URG data to read. */
1228         if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1229                 return -EINVAL; /* Yes this is right ! */
1230
1231         if (sk->state==TCP_CLOSE && !sk->done)
1232                 return -ENOTCONN;
1233
1234         if (tp->urg_data & TCP_URG_VALID) {
1235                 int err = 0; 
1236                 char c = tp->urg_data;
1237
1238                 if (!(flags & MSG_PEEK))
1239                         tp->urg_data = TCP_URG_READ;
1240
1241                 /* Read urgent data. */
1242                 msg->msg_flags|=MSG_OOB;
1243
1244                 if(len>0) {
1245                         if (!(flags & MSG_TRUNC))
1246                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1247                         len = 1;
1248                 } else
1249                         msg->msg_flags|=MSG_TRUNC;
1250
1251                 return err ? -EFAULT : len;
1252         }
1253
1254         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1255                 return 0;
1256
1257         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1258          * the available implementations agree in this case:
1259          * this call should never block, independent of the
1260          * blocking state of the socket.
1261          * Mike <pall@rz.uni-karlsruhe.de>
1262          */
1263         return -EAGAIN;
1264 }
1265
1266 /*
1267  *      Release a skb if it is no longer needed. This routine
1268  *      must be called with interrupts disabled or with the
1269  *      socket locked so that the sk_buff queue operation is ok.
1270  */
1271
1272 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1273 {
1274         __skb_unlink(skb, &sk->receive_queue);
1275         __kfree_skb(skb);
1276 }
1277
1278 /* Clean up the receive buffer for full frames taken by the user,
1279  * then send an ACK if necessary.  COPIED is the number of bytes
1280  * tcp_recvmsg has given to the user so far, it speeds up the
1281  * calculation of whether or not we must ACK for the sake of
1282  * a window update.
1283  */
1284 static void cleanup_rbuf(struct sock *sk, int copied)
1285 {
1286         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1287         int time_to_ack = 0;
1288
1289 #if TCP_DEBUG
1290         struct sk_buff *skb = skb_peek(&sk->receive_queue);
1291
1292         BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1293 #endif
1294
1295         if (tcp_ack_scheduled(tp)) {
1296                 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1297                 if (tp->ack.blocked
1298                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1299                     || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1300                     /*
1301                      * If this read emptied the read buffer, we send an ACK when
1302                      * the connection is not bidirectional, the user has drained
1303                      * the receive buffer and there was a small segment
1304                      * in the queue.
1305                      */
1306                     || (copied > 0 &&
1307                         (tp->ack.pending&TCP_ACK_PUSHED) &&
1308                         !tp->ack.pingpong &&
1309                         atomic_read(&sk->rmem_alloc) == 0)) {
1310                         time_to_ack = 1;
1311                 }
1312         }
1313
1314         /* We send an ACK if we can now advertise a non-zero window
1315          * which has been raised "significantly".
1316          *
1317          * Even if the window is raised up to infinity, do not send a window-open ACK
1318          * in states where we will not receive any more data. It is useless.
1319          */
1320         if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1321                 __u32 rcv_window_now = tcp_receive_window(tp);
1322
1323                 /* Optimize, __tcp_select_window() is not cheap. */
1324                 if (2*rcv_window_now <= tp->window_clamp) {
1325                         __u32 new_window = __tcp_select_window(sk);
1326
1327                         /* Send ACK now, if this read freed lots of space
1328                          * in our buffer. Certainly, new_window is the new window.
1329                          * We can advertise it now if it is not less than the current one.
1330                          * "Lots" means "at least twice" here.
1331                          */
1332                         if(new_window && new_window >= 2*rcv_window_now)
1333                                 time_to_ack = 1;
1334                 }
1335         }
1336         if (time_to_ack)
1337                 tcp_send_ack(sk);
1338 }
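/* Worked example of the window check above (illustrative numbers only):
 * with tp->window_clamp = 64K and rcv_window_now = 16K, the cheap test
 * 2*16K <= 64K passes, so __tcp_select_window() is consulted; the ACK is
 * then sent only if the newly selected window is at least 2*16K = 32K,
 * i.e. only if this read really freed "lots" of receive space.
 */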
1339
1340 /* Now the socket state, including sk->err, is changed only under the lock,
1341  * hence we may omit checks after joining the wait queue.
1342  * We check the receive queue before schedule() only as an optimization;
1343  * it is very likely that release_sock() added new data.
1344  */
1345
1346 static long tcp_data_wait(struct sock *sk, long timeo)
1347 {
1348         DECLARE_WAITQUEUE(wait, current);
1349
1350         add_wait_queue(sk->sleep, &wait);
1351
1352         __set_current_state(TASK_INTERRUPTIBLE);
1353
1354         set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1355         release_sock(sk);
1356
1357         if (skb_queue_empty(&sk->receive_queue))
1358                 timeo = schedule_timeout(timeo);
1359
1360         lock_sock(sk);
1361         clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1362
1363         remove_wait_queue(sk->sleep, &wait);
1364         __set_current_state(TASK_RUNNING);
1365         return timeo;
1366 }
1367
1368 static void tcp_prequeue_process(struct sock *sk)
1369 {
1370         struct sk_buff *skb;
1371         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1372
1373         net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1374
1375         /* The RX process wants to run with BHs disabled, though it is not strictly necessary */
1376         local_bh_disable();
1377         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1378                 sk->backlog_rcv(sk, skb);
1379         local_bh_enable();
1380
1381         /* Clear memory counter. */
1382         tp->ucopy.memory = 0;
1383 }
1384
1385 static inline
1386 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1387 {
1388         struct sk_buff *skb;
1389         u32 offset;
1390
1391         skb_queue_walk(&sk->receive_queue, skb) {
1392                 offset = seq - TCP_SKB_CB(skb)->seq;
1393                 if (skb->h.th->syn)
1394                         offset--;
1395                 if (offset < skb->len || skb->h.th->fin) {
1396                         *off = offset;
1397                         return skb;
1398                 }
1399         }
1400         return NULL;
1401 }
1402
1403 /*
1404  * This routine provides an alternative to tcp_recvmsg() for routines
1405  * that would like to handle copying from skbuffs directly in 'sendfile'
1406  * fashion.
1407  * Note:
1408  *      - It is assumed that the socket was locked by the caller.
1409  *      - The routine does not block.
1410  *      - At present, there is no support for reading OOB data
1411  *        or for 'peeking' the socket using this routine
1412  *        (although both would be easy to implement).
1413  */
1414 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1415                   sk_read_actor_t recv_actor)
1416 {
1417         struct sk_buff *skb;
1418         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1419         u32 seq = tp->copied_seq;
1420         u32 offset;
1421         int copied = 0;
1422
1423         if (sk->state == TCP_LISTEN)
1424                 return -ENOTCONN;
1425         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1426                 if (offset < skb->len) {
1427                         size_t used, len;
1428
1429                         len = skb->len - offset;
1430                         /* Stop reading if we hit a patch of urgent data */
1431                         if (tp->urg_data) {
1432                                 u32 urg_offset = tp->urg_seq - seq;
1433                                 if (urg_offset < len)
1434                                         len = urg_offset;
1435                                 if (!len)
1436                                         break;
1437                         }
1438                         used = recv_actor(desc, skb, offset, len);
1439                         if (used <= len) {
1440                                 seq += used;
1441                                 copied += used;
1442                                 offset += used;
1443                         }
1444                         if (offset != skb->len)
1445                                 break;
1446                 }
1447                 if (skb->h.th->fin) {
1448                         tcp_eat_skb(sk, skb);
1449                         ++seq;
1450                         break;
1451                 }
1452                 tcp_eat_skb(sk, skb);
1453                 if (!desc->count)
1454                         break;
1455         }
1456         tp->copied_seq = seq;
1457
1458         tcp_rcv_space_adjust(sk);
1459
1460         /* Clean up data we have read: This will do ACK frames. */
1461         if (copied)
1462                 cleanup_rbuf(sk, copied);
1463         return copied;
1464 }
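/* A minimal sketch of a recv_actor usable with tcp_read_sock(), assuming the
 * 2.4 read_descriptor_t layout (written/count/buf/error), a caller that
 * points desc->buf at a kernel buffer of at least desc->count bytes, and the
 * availability of skb_copy_bits().  The function name and buffer handling
 * here are illustrative only, not taken from any in-tree user:
 *
 *	static int example_recv_actor(read_descriptor_t *desc,
 *				      struct sk_buff *skb,
 *				      unsigned int offset, size_t len)
 *	{
 *		size_t want = min_t(size_t, len, desc->count);
 *
 *		if (skb_copy_bits(skb, offset, desc->buf + desc->written, want)) {
 *			desc->error = -EFAULT;
 *			return 0;	// consume nothing, which stops the loop
 *		}
 *		desc->written += want;
 *		desc->count -= want;
 *		return want;		// bytes consumed from this skb
 *	}
 *
 * tcp_read_sock() advances copied_seq by the returned count and stops once
 * desc->count hits zero or the actor takes less than it was offered.
 */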
1465
1466 /*
1467  *      This routine copies from a sock struct into the user buffer. 
1468  *
1469  *      Technical note: in 2.3 we work on a _locked_ socket, so that
1470  *      tricks with *seq access order and skb->users are not required.
1471  *      Probably, the code can easily be improved even further.
1472  */
1473  
1474 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1475                 int len, int nonblock, int flags, int *addr_len)
1476 {
1477         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1478         int copied = 0;
1479         u32 peek_seq;
1480         u32 *seq;
1481         unsigned long used;
1482         int err;
1483         int target;             /* Read at least this many bytes */
1484         long timeo;
1485         struct task_struct *user_recv = NULL;
1486
1487         lock_sock(sk);
1488
1489         TCP_CHECK_TIMER(sk);
1490
1491         err = -ENOTCONN;
1492         if (sk->state == TCP_LISTEN)
1493                 goto out;
1494
1495         timeo = sock_rcvtimeo(sk, nonblock);
1496
1497         /* Urgent data needs to be handled specially. */
1498         if (flags & MSG_OOB)
1499                 goto recv_urg;
1500
1501         seq = &tp->copied_seq;
1502         if (flags & MSG_PEEK) {
1503                 peek_seq = tp->copied_seq;
1504                 seq = &peek_seq;
1505         }
1506
1507         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1508
1509         do {
1510                 struct sk_buff * skb;
1511                 u32 offset;
1512
1513                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1514                 if (tp->urg_data && tp->urg_seq == *seq) {
1515                         if (copied)
1516                                 break;
1517                         if (signal_pending(current)) {
1518                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1519                                 break;
1520                         }
1521                 }
1522
1523                 /* Next get a buffer. */
1524
1525                 skb = skb_peek(&sk->receive_queue);
1526                 do {
1527                         if (!skb)
1528                                 break;
1529
1530                         /* Now that we have two receive queues this 
1531                          * shouldn't happen.
1532                          */
1533                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1534                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1535                                        *seq, TCP_SKB_CB(skb)->seq);
1536                                 break;
1537                         }
1538                         offset = *seq - TCP_SKB_CB(skb)->seq;
1539                         if (skb->h.th->syn)
1540                                 offset--;
1541                         if (offset < skb->len)
1542                                 goto found_ok_skb;
1543                         if (skb->h.th->fin)
1544                                 goto found_fin_ok;
1545                         BUG_TRAP(flags&MSG_PEEK);
1546                         skb = skb->next;
1547                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1548
1549                 /* Well, if we have backlog, try to process it now. */
1550
1551                 if (copied >= target && sk->backlog.tail == NULL)
1552                         break;
1553
1554                 if (copied) {
1555                         if (sk->err ||
1556                             sk->state == TCP_CLOSE ||
1557                             (sk->shutdown & RCV_SHUTDOWN) ||
1558                             !timeo ||
1559                             signal_pending(current) ||
1560                             (flags & MSG_PEEK))
1561                                 break;
1562                 } else {
1563                         if (sk->done)
1564                                 break;
1565
1566                         if (sk->err) {
1567                                 copied = sock_error(sk);
1568                                 break;
1569                         }
1570
1571                         if (sk->shutdown & RCV_SHUTDOWN)
1572                                 break;
1573
1574                         if (sk->state == TCP_CLOSE) {
1575                                 if (!sk->done) {
1576                                         /* This occurs when the user tries to read
1577                                          * from a never-connected socket.
1578                                          */
1579                                         copied = -ENOTCONN;
1580                                         break;
1581                                 }
1582                                 break;
1583                         }
1584
1585                         if (!timeo) {
1586                                 copied = -EAGAIN;
1587                                 break;
1588                         }
1589
1590                         if (signal_pending(current)) {
1591                                 copied = sock_intr_errno(timeo);
1592                                 break;
1593                         }
1594                 }
1595
1596                 cleanup_rbuf(sk, copied);
1597
1598                 if (tp->ucopy.task == user_recv) {
1599                         /* Install new reader */
1600                         if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1601                                 user_recv = current;
1602                                 tp->ucopy.task = user_recv;
1603                                 tp->ucopy.iov = msg->msg_iov;
1604                         }
1605
1606                         tp->ucopy.len = len;
1607
1608                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1609
1610                         /* Ugly... If the prequeue is not empty, we have to
1611                          * process it before releasing the socket; otherwise
1612                          * the order will be broken at the second iteration.
1613                          * A more elegant solution is required!!!
1614                          *
1615                          * Look: we have the following (pseudo)queues:
1616                          *
1617                          * 1. packets in flight
1618                          * 2. backlog
1619                          * 3. prequeue
1620                          * 4. receive_queue
1621                          *
1622                          * Each queue can be processed only if the next ones
1623                          * are empty. At this point we have empty receive_queue.
1624                          * But the prequeue _can_ be non-empty after the second iteration,
1625                          * when we jumped to the start of the loop because backlog
1626                          * processing added something to the receive_queue.
1627                          * We cannot release_sock(), because the backlog contains
1628                          * packets that arrived _after_ the prequeued ones.
1629                          *
1630                          * In short, the algorithm is clear --- process all
1631                          * the queues in order. We could do it more directly,
1632                          * requeueing packets from the backlog to the prequeue if it
1633                          * is not empty. That is more elegant, but eats cycles,
1634                          * unfortunately.
1635                          */
1636                         if (skb_queue_len(&tp->ucopy.prequeue))
1637                                 goto do_prequeue;
1638
1639                         /* __ Set realtime policy in scheduler __ */
1640                 }
1641
1642                 if (copied >= target) {
1643                         /* Do not sleep, just process backlog. */
1644                         release_sock(sk);
1645                         lock_sock(sk);
1646                 } else {
1647                         timeo = tcp_data_wait(sk, timeo);
1648                 }
1649
1650                 if (user_recv) {
1651                         int chunk;
1652
1653                         /* __ Restore normal policy in scheduler __ */
1654
1655                         if ((chunk = len - tp->ucopy.len) != 0) {
1656                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1657                                 len -= chunk;
1658                                 copied += chunk;
1659                         }
1660
1661                         if (tp->rcv_nxt == tp->copied_seq &&
1662                             skb_queue_len(&tp->ucopy.prequeue)) {
1663 do_prequeue:
1664                                 tcp_prequeue_process(sk);
1665
1666                                 if ((chunk = len - tp->ucopy.len) != 0) {
1667                                         net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1668                                         len -= chunk;
1669                                         copied += chunk;
1670                                 }
1671                         }
1672                 }
1673                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1674                         if (net_ratelimit())
1675                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1676                                        current->comm, current->pid);
1677                         peek_seq = tp->copied_seq;
1678                 }
1679                 continue;
1680
1681         found_ok_skb:
1682                 /* Ok so how much can we use? */
1683                 used = skb->len - offset;
1684                 if (len < used)
1685                         used = len;
1686
1687                 /* Do we have urgent data here? */
1688                 if (tp->urg_data) {
1689                         u32 urg_offset = tp->urg_seq - *seq;
1690                         if (urg_offset < used) {
1691                                 if (!urg_offset) {
1692                                         if (!sk->urginline) {
1693                                                 ++*seq;
1694                                                 offset++;
1695                                                 used--;
1696                                                 if (!used)
1697                                                         goto skip_copy;
1698                                         }
1699                                 } else
1700                                         used = urg_offset;
1701                         }
1702                 }
1703
1704                 if (!(flags&MSG_TRUNC)) {
1705                         err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1706                         if (err) {
1707                                 /* Exception. Bailout! */
1708                                 if (!copied)
1709                                         copied = -EFAULT;
1710                                 break;
1711                         }
1712                 }
1713
1714                 *seq += used;
1715                 copied += used;
1716                 len -= used;
1717
1718                 tcp_rcv_space_adjust(sk);
1719
1720 skip_copy:
1721                 if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1722                         tp->urg_data = 0;
1723                         tcp_fast_path_check(sk, tp);
1724                 }
1725                 if (used + offset < skb->len)
1726                         continue;
1727
1728                 if (skb->h.th->fin)
1729                         goto found_fin_ok;
1730                 if (!(flags & MSG_PEEK))
1731                         tcp_eat_skb(sk, skb);
1732                 continue;
1733
1734         found_fin_ok:
1735                 /* Process the FIN. */
1736                 ++*seq;
1737                 if (!(flags & MSG_PEEK))
1738                         tcp_eat_skb(sk, skb);
1739                 break;
1740         } while (len > 0);
1741
1742         if (user_recv) {
1743                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1744                         int chunk;
1745
1746                         tp->ucopy.len = copied > 0 ? len : 0;
1747
1748                         tcp_prequeue_process(sk);
1749
1750                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1751                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1752                                 len -= chunk;
1753                                 copied += chunk;
1754                         }
1755                 }
1756
1757                 tp->ucopy.task = NULL;
1758                 tp->ucopy.len = 0;
1759         }
1760
1761         /* According to UNIX98, msg_name/msg_namelen are ignored
1762          * on a connected socket. I was just happy when I found this 8) --ANK
1763          */
1764
1765         /* Clean up data we have read: This will do ACK frames. */
1766         cleanup_rbuf(sk, copied);
1767
1768         TCP_CHECK_TIMER(sk);
1769         release_sock(sk);
1770         return copied;
1771
1772 out:
1773         TCP_CHECK_TIMER(sk);
1774         release_sock(sk);
1775         return err;
1776
1777 recv_urg:
1778         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1779         goto out;
1780 }
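/* Userspace view of the MSG_PEEK handling above: a peek leaves copied_seq
 * untouched, so the same bytes come back from the following plain recv();
 * the "race in MSG_PEEK" warning fires only when another thread consumes
 * data between the two calls.  A minimal sketch, assuming a connected socket
 * fd with a single reader (hypothetical caller code):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *
 *	static ssize_t peek_then_read(int fd, char *buf, size_t len)
 *	{
 *		ssize_t peeked = recv(fd, buf, len, MSG_PEEK);
 *
 *		if (peeked <= 0)
 *			return peeked;
 *		return recv(fd, buf, peeked, 0);	// same bytes again
 *	}
 */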
1781
1782 /*
1783  *      State processing on a close. This implements the state shift for
1784  *      sending our FIN frame. Note that we only send a FIN for some
1785  *      states. A shutdown() may have already sent the FIN, or we may be
1786  *      closed.
1787  */
1788
1789 static unsigned char new_state[16] = {
1790   /* current state:        new state:      action:      */
1791   /* (Invalid)          */ TCP_CLOSE,
1792   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1793   /* TCP_SYN_SENT       */ TCP_CLOSE,
1794   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1795   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1796   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1797   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1798   /* TCP_CLOSE          */ TCP_CLOSE,
1799   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1800   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1801   /* TCP_LISTEN         */ TCP_CLOSE,
1802   /* TCP_CLOSING        */ TCP_CLOSING,
1803 };
1804
1805 static int tcp_close_state(struct sock *sk)
1806 {
1807         int next = (int) new_state[sk->state];
1808         int ns = (next & TCP_STATE_MASK);
1809
1810         tcp_set_state(sk, ns);
1811
1812         return (next & TCP_ACTION_FIN);
1813 }
1814
1815 /*
1816  *      Shutdown the sending side of a connection. Much like close except
1817  *      that we don't shut down the receiving side or set sk->dead.
1818  */
1819
1820 void tcp_shutdown(struct sock *sk, int how)
1821 {
1822         /*      We need to grab some memory, and put together a FIN,
1823          *      and then put it into the queue to be sent.
1824          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1825          */
1826         if (!(how & SEND_SHUTDOWN))
1827                 return;
1828
1829         /* If we've already sent a FIN, or it's a closed state, skip this. */
1830         if ((1 << sk->state) &
1831             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1832                 /* Clear out any half completed packets.  FIN if needed. */
1833                 if (tcp_close_state(sk))
1834                         tcp_send_fin(sk);
1835         }
1836 }
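/* From user space this path is reached via shutdown(2) with SHUT_WR: the FIN
 * goes out (when the state calls for one) while the receive side stays open,
 * which is the usual way to signal end-of-request and still read the peer's
 * reply.  A minimal sketch, assuming a connected socket fd (hypothetical
 * caller code):
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void half_close(int fd)
 *	{
 *		char buf[512];
 *
 *		shutdown(fd, SHUT_WR);			// send our FIN, keep reading
 *		while (read(fd, buf, sizeof(buf)) > 0)
 *			;				// drain the peer's reply
 *		close(fd);
 *	}
 */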
1837
1838
1839 /*
1840  *      Return 1 if we still have things to send in our buffers.
1841  */
1842
1843 static inline int closing(struct sock * sk)
1844 {
1845         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1846 }
1847
1848 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1849 {
1850         /* First the read buffer. */
1851         __skb_queue_purge(&sk->receive_queue);
1852
1853         /* Next, the error queue. */
1854         __skb_queue_purge(&sk->error_queue);
1855
1856         /* Next, the write queue. */
1857         BUG_TRAP(skb_queue_empty(&sk->write_queue));
1858
1859         /* Account for returned memory. */
1860         tcp_mem_reclaim(sk);
1861
1862         BUG_TRAP(sk->wmem_queued == 0);
1863         BUG_TRAP(sk->forward_alloc == 0);
1864
1865         /* It is _impossible_ for the backlog to contain anything
1866          * when we get here.  All user references to this socket
1867          * have gone away; only the net layer can touch it now.
1868          */
1869 }
1870
1871 /*
1872  * At this point, there should be no process reference to this
1873  * socket, and thus no user references at all.  Therefore we
1874  * can assume the socket waitqueue is inactive and nobody will
1875  * try to jump onto it.
1876  */
1877 void tcp_destroy_sock(struct sock *sk)
1878 {
1879         BUG_TRAP(sk->state==TCP_CLOSE);
1880         BUG_TRAP(sk->dead);
1881
1882         /* It cannot be in hash table! */
1883         BUG_TRAP(sk->pprev==NULL);
1884
1885         /* If it has a non-zero sk->num, it must be bound */
1886         BUG_TRAP(!sk->num || sk->prev!=NULL);
1887
1888 #ifdef TCP_DEBUG
1889         if (sk->zapped) {
1890                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1891                 sock_hold(sk);
1892         }
1893         sk->zapped = 1;
1894 #endif
1895
1896         sk->prot->destroy(sk);
1897
1898         tcp_kill_sk_queues(sk);
1899
1900 #ifdef INET_REFCNT_DEBUG
1901         if (atomic_read(&sk->refcnt) != 1) {
1902                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1903         }
1904 #endif
1905
1906         atomic_dec(&tcp_orphan_count);
1907         sock_put(sk);
1908 }
1909
1910 void tcp_close(struct sock *sk, long timeout)
1911 {
1912         struct sk_buff *skb;
1913         int data_was_unread = 0;
1914
1915         lock_sock(sk);
1916         sk->shutdown = SHUTDOWN_MASK;
1917
1918         if(sk->state == TCP_LISTEN) {
1919                 tcp_set_state(sk, TCP_CLOSE);
1920
1921                 /* Special case. */
1922                 tcp_listen_stop(sk);
1923
1924                 goto adjudge_to_death;
1925         }
1926
1927         /*  We need to flush the receive buffers.  We do this only on the
1928          *  descriptor close, not protocol-sourced closes, because the
1929          *  reader process may not have drained the data yet!
1930          */
1931         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1932                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1933                 data_was_unread += len;
1934                 __kfree_skb(skb);
1935         }
1936
1937         tcp_mem_reclaim(sk);
1938
1939         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1940          * 3.10, we send a RST here because data was lost.  To
1941          * witness the awful effects of the old behavior of always
1942          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1943          * a bulk GET in an FTP client, suspend the process, wait
1944          * for the client to advertise a zero window, then kill -9
1945          * the FTP client, wheee...  Note: timeout is always zero
1946          * in such a case.
1947          */
1948         if(data_was_unread != 0) {
1949                 /* Unread data was tossed, zap the connection. */
1950                 NET_INC_STATS_USER(TCPAbortOnClose);
1951                 tcp_set_state(sk, TCP_CLOSE);
1952                 tcp_send_active_reset(sk, GFP_KERNEL);
1953         } else if (sk->linger && sk->lingertime==0) {
1954                 /* Check zero linger _after_ checking for unread data. */
1955                 sk->prot->disconnect(sk, 0);
1956                 NET_INC_STATS_USER(TCPAbortOnData);
1957         } else if (tcp_close_state(sk)) {
1958                 /* We FIN if the application ate all the data before
1959                  * zapping the connection.
1960                  */
1961
1962                 /* RED-PEN. Formally speaking, we have broken TCP state
1963                  * machine. State transitions:
1964                  *
1965                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1966                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1967                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1968                  *
1969                  * are legal only when FIN has been sent (i.e. in window),
1970                  * rather than queued out of window. Purists blame.
1971                  *
1972                  * F.e. "RFC state" is ESTABLISHED,
1973                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1974                  *
1975                  * The visible deviations are that sometimes
1976                  * we enter the time-wait state when it is not really required
1977                  * (harmless), and that we do not send active resets when they are
1978                  * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1979                  * they look like CLOSING or LAST_ACK to Linux).
1980                  * Probably, I missed some more small holes.
1981                  *                                              --ANK
1982                  */
1983                 tcp_send_fin(sk);
1984         }
1985
1986         if (timeout) {
1987                 struct task_struct *tsk = current;
1988                 DECLARE_WAITQUEUE(wait, current);
1989
1990                 add_wait_queue(sk->sleep, &wait);
1991
1992                 do {
1993                         set_current_state(TASK_INTERRUPTIBLE);
1994                         if (!closing(sk))
1995                                 break;
1996                         release_sock(sk);
1997                         timeout = schedule_timeout(timeout);
1998                         lock_sock(sk);
1999                 } while (!signal_pending(tsk) && timeout);
2000
2001                 tsk->state = TASK_RUNNING;
2002                 remove_wait_queue(sk->sleep, &wait);
2003         }
2004
2005 adjudge_to_death:
2006         /* It is the last release_sock in its life. It will remove backlog. */
2007         release_sock(sk);
2008
2009
2010         /* Now socket is owned by kernel and we acquire BH lock
2011            to finish close. No need to check for user refs.
2012          */
2013         local_bh_disable();
2014         bh_lock_sock(sk);
2015         BUG_TRAP(sk->lock.users==0);
2016
2017         sock_hold(sk);
2018         sock_orphan(sk);
2019
2020         /*      This is a (useful) BSD violation of the RFC. There is a
2021          *      problem with TCP as specified, in that the other end could
2022          *      keep a socket open forever with no application left at this end.
2023          *      We use a 3 minute timeout (about the same as BSD) and then kill
2024          *      our end. If they send after that then tough - BUT: it is long enough
2025          *      that we won't repeat the old "4*rto = almost no time - whoops,
2026          *      reset" mistake.
2027          *
2028          *      Nope, it was not a mistake. It is really desired behaviour,
2029          *      e.g. on HTTP servers, where such sockets are useless but
2030          *      consume significant resources. Let's do it with the special
2031          *      linger2 option.                                 --ANK
2032          */
2033
2034         if (sk->state == TCP_FIN_WAIT2) {
2035                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2036                 if (tp->linger2 < 0) {
2037                         tcp_set_state(sk, TCP_CLOSE);
2038                         tcp_send_active_reset(sk, GFP_ATOMIC);
2039                         NET_INC_STATS_BH(TCPAbortOnLinger);
2040                 } else {
2041                         int tmo = tcp_fin_time(tp);
2042
2043                         if (tmo > TCP_TIMEWAIT_LEN) {
2044                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2045                         } else {
2046                                 atomic_inc(&tcp_orphan_count);
2047                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2048                                 goto out;
2049                         }
2050                 }
2051         }
2052         if (sk->state != TCP_CLOSE) {
2053                 tcp_mem_reclaim(sk);
2054                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2055                     (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2056                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2057                         if (net_ratelimit())
2058                                 printk(KERN_INFO "TCP: too many orphaned sockets\n");
2059                         tcp_set_state(sk, TCP_CLOSE);
2060                         tcp_send_active_reset(sk, GFP_ATOMIC);
2061                         NET_INC_STATS_BH(TCPAbortOnMemory);
2062                 }
2063         }
2064         atomic_inc(&tcp_orphan_count);
2065
2066         if (sk->state == TCP_CLOSE)
2067                 tcp_destroy_sock(sk);
2068         /* Otherwise, socket is reprieved until protocol close. */
2069
2070 out:
2071         bh_unlock_sock(sk);
2072         local_bh_enable();
2073         sock_put(sk);
2074 }
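/* The zero-linger branch above (sk->linger && sk->lingertime == 0) is what a
 * user selects with SO_LINGER {l_onoff = 1, l_linger = 0}: close() then
 * aborts the connection with a reset instead of the orderly FIN exchange and
 * leaves no time-wait state behind.  A minimal sketch (hypothetical caller
 * code):
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void abortive_close(int fd)
 *	{
 *		struct linger lin = { 1, 0 };	// l_onoff = 1, l_linger = 0
 *
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
 *		close(fd);			// RST, no FIN/TIME-WAIT
 *	}
 */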
2075
2076 /* These states need RST on ABORT according to RFC793 */
2077
2078 static inline int tcp_need_reset(int state)
2079 {
2080         return ((1 << state) &
2081                 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2082                  TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2083 }
2084
2085 int tcp_disconnect(struct sock *sk, int flags)
2086 {
2087         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2088         int old_state;
2089         int err = 0;
2090
2091         old_state = sk->state;
2092         if (old_state != TCP_CLOSE)
2093                 tcp_set_state(sk, TCP_CLOSE);
2094
2095         /* ABORT function of RFC793 */
2096         if (old_state == TCP_LISTEN) {
2097                 tcp_listen_stop(sk);
2098         } else if (tcp_need_reset(old_state) ||
2099                    (tp->snd_nxt != tp->write_seq &&
2100                     (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2101                 /* The last check adjusts for the discrepancy between Linux and the
2102                  * RFC states.
2103                  */
2104                 tcp_send_active_reset(sk, gfp_any());
2105                 sk->err = ECONNRESET;
2106         } else if (old_state == TCP_SYN_SENT)
2107                 sk->err = ECONNRESET;
2108
2109         tcp_clear_xmit_timers(sk);
2110         __skb_queue_purge(&sk->receive_queue);
2111         tcp_writequeue_purge(sk);
2112         __skb_queue_purge(&tp->out_of_order_queue);
2113
2114         sk->dport = 0;
2115
2116         if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2117                 sk->rcv_saddr = 0;
2118                 sk->saddr = 0;
2119 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2120                 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2121                 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2122 #endif
2123         }
2124
2125         sk->shutdown = 0;
2126         sk->done = 0;
2127         tp->srtt = 0;
2128         if ((tp->write_seq += tp->max_window+2) == 0)
2129                 tp->write_seq = 1;
2130         tp->backoff = 0;
2131         tp->snd_cwnd = 2;
2132         tp->probes_out = 0;
2133         tp->packets_out = 0;
2134         tp->snd_ssthresh = 0x7fffffff;
2135         tp->snd_cwnd_cnt = 0;
2136         tcp_set_ca_state(tp, TCP_CA_Open);
2137         tcp_clear_retrans(tp);
2138         tcp_delack_init(tp);
2139         tp->send_head = NULL;
2140         tp->saw_tstamp = 0;
2141         tcp_sack_reset(tp);
2142         __sk_dst_reset(sk);
2143
2144         BUG_TRAP(!sk->num || sk->prev);
2145
2146         sk->error_report(sk);
2147         return err;
2148 }
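/* One way tcp_disconnect() is reached from user space is a connect(2) call
 * whose address family is AF_UNSPEC, which the inet_stream_connect() path
 * maps onto sk->prot->disconnect().  A minimal sketch (hypothetical caller
 * code; the socket stays open and may be connected again afterwards):
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int tcp_dissolve(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */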
2149
2150 /*
2151  *      Wait for an incoming connection, avoid race
2152  *      conditions. This must be called with the socket locked.
2153  */
2154 static int wait_for_connect(struct sock * sk, long timeo)
2155 {
2156         DECLARE_WAITQUEUE(wait, current);
2157         int err;
2158
2159         /*
2160          * True wake-one mechanism for incoming connections: only
2161          * one process gets woken up, not the 'whole herd'.
2162          * Since we do not 'race & poll' for established sockets
2163          * anymore, the common case will execute the loop only once.
2164          *
2165          * Subtle issue: "add_wait_queue_exclusive()" will be added
2166          * after any current non-exclusive waiters, and we know that
2167          * it will always _stay_ after any new non-exclusive waiters
2168          * because all non-exclusive waiters are added at the
2169          * beginning of the wait-queue. As such, it's ok to "drop"
2170          * our exclusiveness temporarily when we get woken up without
2171          * having to remove and re-insert us on the wait queue.
2172          */
2173         add_wait_queue_exclusive(sk->sleep, &wait);
2174         for (;;) {
2175                 current->state = TASK_INTERRUPTIBLE;
2176                 release_sock(sk);
2177                 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2178                         timeo = schedule_timeout(timeo);
2179                 lock_sock(sk);
2180                 err = 0;
2181                 if (sk->tp_pinfo.af_tcp.accept_queue)
2182                         break;
2183                 err = -EINVAL;
2184                 if (sk->state != TCP_LISTEN)
2185                         break;
2186                 err = sock_intr_errno(timeo);
2187                 if (signal_pending(current))
2188                         break;
2189                 err = -EAGAIN;
2190                 if (!timeo)
2191                         break;
2192         }
2193         current->state = TASK_RUNNING;
2194         remove_wait_queue(sk->sleep, &wait);
2195         return err;
2196 }
2197
2198 /*
2199  *      This will accept the next outstanding connection.
2200  */
2201
2202 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2203 {
2204         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2205         struct open_request *req;
2206         struct sock *newsk;
2207         int error;
2208
2209         lock_sock(sk); 
2210
2211         /* We need to make sure that this socket is listening,
2212          * and that it has something pending.
2213          */
2214         error = -EINVAL;
2215         if (sk->state != TCP_LISTEN)
2216                 goto out;
2217
2218         /* Find already established connection */
2219         if (!tp->accept_queue) {
2220                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2221
2222                 /* If this is a non-blocking socket, don't sleep */
2223                 error = -EAGAIN;
2224                 if (!timeo)
2225                         goto out;
2226
2227                 error = wait_for_connect(sk, timeo);
2228                 if (error)
2229                         goto out;
2230         }
2231
2232         req = tp->accept_queue;
2233         if ((tp->accept_queue = req->dl_next) == NULL)
2234                 tp->accept_queue_tail = NULL;
2235
2236         newsk = req->sk;
2237         tcp_acceptq_removed(sk);
2238         tcp_openreq_fastfree(req);
2239         BUG_TRAP(newsk->state != TCP_SYN_RECV);
2240         release_sock(sk);
2241         return newsk;
2242
2243 out:
2244         release_sock(sk);
2245         *err = error; 
2246         return NULL;
2247 }
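/* The !timeo branch above is the non-blocking accept(2) case: with O_NONBLOCK
 * set on the listening socket, accept() fails with EAGAIN instead of sleeping
 * in wait_for_connect().  A minimal sketch (hypothetical caller code):
 *
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <sys/socket.h>
 *
 *	static int try_accept(int lfd)
 *	{
 *		int cfd;
 *
 *		fcntl(lfd, F_SETFL, fcntl(lfd, F_GETFL, 0) | O_NONBLOCK);
 *		cfd = accept(lfd, NULL, NULL);
 *		if (cfd < 0 && errno == EAGAIN)
 *			return -1;		// nothing queued yet; poll and retry
 *		return cfd;
 *	}
 */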
2248
2249 /*
2250  *      Socket option code for TCP. 
2251  */
2252   
2253 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, 
2254                    int optlen)
2255 {
2256         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2257         int val;
2258         int err = 0;
2259
2260         if (level != SOL_TCP)
2261                 return tp->af_specific->setsockopt(sk, level, optname, 
2262                                                    optval, optlen);
2263
2264         if(optlen<sizeof(int))
2265                 return -EINVAL;
2266
2267         if (get_user(val, (int *)optval))
2268                 return -EFAULT;
2269
2270         lock_sock(sk);
2271
2272         switch(optname) {
2273         case TCP_MAXSEG:
2274                 /* Values greater than the interface MTU won't take effect.  However, at
2275                  * the point when this call is made we typically don't yet know
2276                  * which interface is going to be used.
2277                  */
2278                 if(val < 8 || val > MAX_TCP_WINDOW) {
2279                         err = -EINVAL;
2280                         break;
2281                 }
2282                 tp->user_mss = val;
2283                 break;
2284
2285         case TCP_NODELAY:
2286                 /* You cannot try to use this and TCP_CORK in
2287                  * tandem, so let the user know.
2288                  */
2289                 if (tp->nonagle == 2) {
2290                         err = -EINVAL;
2291                         break;
2292                 }
2293                 tp->nonagle = (val == 0) ? 0 : 1;
2294                 if (val)
2295                         tcp_push_pending_frames(sk, tp);
2296                 break;
2297
2298         case TCP_CORK:
2299                 /* When set, indicates that non-full frames should always be queued.
2300                  * Later the user clears this option and we transmit
2301                  * any pending partial frames in the queue.  This is
2302                  * meant to be used alongside sendfile() to get properly
2303                  * filled frames when the user (for example) must write
2304                  * out headers with a write() call first and then use
2305                  * sendfile to send out the data parts.
2306                  *
2307                  * You cannot try to use TCP_NODELAY and this mechanism
2308                  * at the same time, so let the user know.
2309                  */
2310                 if (tp->nonagle == 1) {
2311                         err = -EINVAL;
2312                         break;
2313                 }
2314                 if (val != 0) {
2315                         tp->nonagle = 2;
2316                 } else {
2317                         tp->nonagle = 0;
2318
2319                         tcp_push_pending_frames(sk, tp);
2320                 }
2321                 break;
2322                 
2323         case TCP_KEEPIDLE:
2324                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2325                         err = -EINVAL;
2326                 else {
2327                         tp->keepalive_time = val * HZ;
2328                         if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2329                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2330                                 if (tp->keepalive_time > elapsed)
2331                                         elapsed = tp->keepalive_time - elapsed;
2332                                 else
2333                                         elapsed = 0;
2334                                 tcp_reset_keepalive_timer(sk, elapsed);
2335                         }
2336                 }
2337                 break;
2338         case TCP_KEEPINTVL:
2339                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2340                         err = -EINVAL;
2341                 else
2342                         tp->keepalive_intvl = val * HZ;
2343                 break;
2344         case TCP_KEEPCNT:
2345                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2346                         err = -EINVAL;
2347                 else
2348                         tp->keepalive_probes = val;
2349                 break;
2350         case TCP_SYNCNT:
2351                 if (val < 1 || val > MAX_TCP_SYNCNT)
2352                         err = -EINVAL;
2353                 else
2354                         tp->syn_retries = val;
2355                 break;
2356
2357         case TCP_LINGER2:
2358                 if (val < 0)
2359                         tp->linger2 = -1;
2360                 else if (val > sysctl_tcp_fin_timeout/HZ)
2361                         tp->linger2 = 0;
2362                 else
2363                         tp->linger2 = val*HZ;
2364                 break;
2365
2366         case TCP_DEFER_ACCEPT:
2367                 tp->defer_accept = 0;
2368                 if (val > 0) {
2369                         /* Translate value in seconds to number of retransmits */
2370                         while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2371                                 tp->defer_accept++;
2372                         tp->defer_accept++;
2373                 }
2374                 break;
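                /* Worked example of the translation above, assuming the usual
                 * TCP_TIMEOUT_INIT of 3*HZ: val = 10 seconds yields
                 * defer_accept = 3 (10 > 3, 10 > 6, 10 <= 12, plus the final
                 * increment), which tcp_getsockopt() reports back as
                 * 3 << (3 - 1) = 12 seconds.
                 */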
2375
2376         case TCP_WINDOW_CLAMP:
2377                 if (val==0) {
2378                         if (sk->state != TCP_CLOSE) {
2379                                 err = -EINVAL;
2380                                 break;
2381                         }
2382                         tp->window_clamp = 0;
2383                 } else {
2384                         tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2385                                 SOCK_MIN_RCVBUF/2 : val;
2386                 }
2387                 break;
2388
2389         case TCP_QUICKACK:
2390                 if (!val) {
2391                         tp->ack.pingpong = 1;
2392                 } else {
2393                         tp->ack.pingpong = 0;
2394                         if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2395                             tcp_ack_scheduled(tp)) {
2396                                 tp->ack.pending |= TCP_ACK_PUSHED;
2397                                 cleanup_rbuf(sk, 1);
2398                                 if (!(val & 1))
2399                                         tp->ack.pingpong = 1;
2400                         }
2401                 }
2402                 break;
2403
2404         default:
2405                 err = -ENOPROTOOPT;
2406                 break;
2407         };
2408         release_sock(sk);
2409         return err;
2410 }
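/* The TCP_CORK case above describes the intended usage pattern: cork, write
 * the headers, sendfile() the body, then uncork so that the last partial
 * frame finally goes out.  A minimal sketch, assuming Linux sendfile(2) and a
 * connected socket fd (hypothetical caller code; error handling omitted):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/sendfile.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void send_response(int fd, const char *hdr, size_t hdrlen,
 *				  int filefd, size_t filelen)
 *	{
 *		int on = 1, off = 0;
 *		off_t pos = 0;
 *
 *		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *		write(fd, hdr, hdrlen);			// queued, not sent on its own
 *		sendfile(fd, filefd, &pos, filelen);	// fills full-sized frames
 *		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *		// clearing TCP_CORK pushes the remaining partial frame
 *	}
 */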
2411
2412 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2413                    int *optlen)
2414 {
2415         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2416         int val, len;
2417
2418         if(level != SOL_TCP)
2419                 return tp->af_specific->getsockopt(sk, level, optname,
2420                                                    optval, optlen);
2421
2422         if(get_user(len,optlen))
2423                 return -EFAULT;
2424
2425         len = min_t(unsigned int, len, sizeof(int));
2426         
2427         if(len < 0)
2428                 return -EINVAL;
2429
2430         switch(optname) {
2431         case TCP_MAXSEG:
2432                 val = tp->mss_cache;
2433                 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2434                         val = tp->user_mss;
2435                 break;
2436         case TCP_NODELAY:
2437                 val = (tp->nonagle == 1);
2438                 break;
2439         case TCP_CORK:
2440                 val = (tp->nonagle == 2);
2441                 break;
2442         case TCP_KEEPIDLE:
2443                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2444                 break;
2445         case TCP_KEEPINTVL:
2446                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2447                 break;
2448         case TCP_KEEPCNT:
2449                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2450                 break;
2451         case TCP_SYNCNT:
2452                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2453                 break;
2454         case TCP_LINGER2:
2455                 val = tp->linger2;
2456                 if (val >= 0)
2457                         val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2458                 break;
2459         case TCP_DEFER_ACCEPT:
2460                 val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2461                 break;
2462         case TCP_WINDOW_CLAMP:
2463                 val = tp->window_clamp;
2464                 break;
2465         case TCP_INFO:
2466         {
2467                 struct tcp_info info;
2468                 u32 now = tcp_time_stamp;
2469
2470                 if(get_user(len,optlen))
2471                         return -EFAULT;
2472                 info.tcpi_state = sk->state;
2473                 info.tcpi_ca_state = tp->ca_state;
2474                 info.tcpi_retransmits = tp->retransmits;
2475                 info.tcpi_probes = tp->probes_out;
2476                 info.tcpi_backoff = tp->backoff;
2477                 info.tcpi_options = 0;
2478                 if (tp->tstamp_ok)
2479                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2480                 if (tp->sack_ok)
2481                         info.tcpi_options |= TCPI_OPT_SACK;
2482                 if (tp->wscale_ok) {
2483                         info.tcpi_options |= TCPI_OPT_WSCALE;
2484                         info.tcpi_snd_wscale = tp->snd_wscale;
2485                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2486                 } else {
2487                         info.tcpi_snd_wscale = 0;
2488                         info.tcpi_rcv_wscale = 0;
2489                 }
2490                 if (tp->ecn_flags&TCP_ECN_OK)
2491                         info.tcpi_options |= TCPI_OPT_ECN;
2492
2493                 info.tcpi_rto = (1000000*tp->rto)/HZ;
2494                 info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2495                 info.tcpi_snd_mss = tp->mss_cache;
2496                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2497
2498                 info.tcpi_unacked = tp->packets_out;
2499                 info.tcpi_sacked = tp->sacked_out;
2500                 info.tcpi_lost = tp->lost_out;
2501                 info.tcpi_retrans = tp->retrans_out;
2502                 info.tcpi_fackets = tp->fackets_out;
2503
2504                 info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2505                 info.tcpi_last_ack_sent = 0;
2506                 info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2507                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2508
2509                 info.tcpi_pmtu = tp->pmtu_cookie;
2510                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2511                 info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2512                 info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2513                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2514                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2515                 info.tcpi_advmss = tp->advmss;
2516                 info.tcpi_reordering = tp->reordering;
2517
2518                 len = min_t(unsigned int, len, sizeof(info));
2519                 if(put_user(len, optlen))
2520                         return -EFAULT;
2521                 if(copy_to_user(optval, &info,len))
2522                         return -EFAULT;
2523                 return 0;
2524         }
2525         case TCP_QUICKACK:
2526                 val = !tp->ack.pingpong;
2527                 break;
2528         default:
2529                 return -ENOPROTOOPT;
2530         };
2531
2532         if(put_user(len, optlen))
2533                 return -EFAULT;
2534         if(copy_to_user(optval, &val,len))
2535                 return -EFAULT;
2536         return 0;
2537 }
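/* The TCP_INFO case above exports the connection state wholesale.  A minimal
 * userspace sketch, assuming a libc whose <netinet/tcp.h> defines a struct
 * tcp_info matching the kernel layout (hypothetical caller code):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void dump_tcp_info(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *			printf("rtt %u us, cwnd %u segments\n",
 *			       info.tcpi_rtt, info.tcpi_snd_cwnd);
 *	}
 */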
2538
2539
2540 extern void __skb_cb_too_small_for_tcp(int, int);
2541 extern void tcpdiag_init(void);
2542
2543 void __init tcp_init(void)
2544 {
2545         struct sk_buff *skb = NULL;
2546         unsigned long goal;
2547         int order, i;
2548
2549         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2550                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2551                                            sizeof(skb->cb));
2552
2553         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2554                                                    sizeof(struct open_request),
2555                                                0, SLAB_HWCACHE_ALIGN,
2556                                                NULL, NULL);
2557         if(!tcp_openreq_cachep)
2558                 panic("tcp_init: Cannot alloc open_request cache.");
2559
2560         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2561                                               sizeof(struct tcp_bind_bucket),
2562                                               0, SLAB_HWCACHE_ALIGN,
2563                                               NULL, NULL);
2564         if(!tcp_bucket_cachep)
2565                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2566
2567         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2568                                                 sizeof(struct tcp_tw_bucket),
2569                                                 0, SLAB_HWCACHE_ALIGN,
2570                                                 NULL, NULL);
2571         if(!tcp_timewait_cachep)
2572                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2573
2574         /* Size and allocate the main established and bind bucket
2575          * hash tables.
2576          *
2577          * The methodology is similar to that of the buffer cache.
2578          */
2579         if (num_physpages >= (128 * 1024))
2580                 goal = num_physpages >> (21 - PAGE_SHIFT);
2581         else
2582                 goal = num_physpages >> (23 - PAGE_SHIFT);
2583
2584         for(order = 0; (1UL << order) < goal; order++)
2585                 ;
2586         do {
2587                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2588                         sizeof(struct tcp_ehash_bucket);
2589                 tcp_ehash_size >>= 1;
2590                 while (tcp_ehash_size & (tcp_ehash_size-1))
2591                         tcp_ehash_size--;
2592                 tcp_ehash = (struct tcp_ehash_bucket *)
2593                         __get_free_pages(GFP_ATOMIC, order);
2594         } while (tcp_ehash == NULL && --order > 0);
2595
2596         if (!tcp_ehash)
2597                 panic("Failed to allocate TCP established hash table\n");
2598         for (i = 0; i < (tcp_ehash_size<<1); i++) {
2599                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2600                 tcp_ehash[i].chain = NULL;
2601         }
2602
2603         do {
2604                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2605                         sizeof(struct tcp_bind_hashbucket);
2606                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2607                         continue;
2608                 tcp_bhash = (struct tcp_bind_hashbucket *)
2609                         __get_free_pages(GFP_ATOMIC, order);
2610         } while (tcp_bhash == NULL && --order >= 0);
2611
2612         if (!tcp_bhash)
2613                 panic("Failed to allocate TCP bind hash table\n");
2614         for (i = 0; i < tcp_bhash_size; i++) {
2615                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2616                 tcp_bhash[i].chain = NULL;
2617         }
2618
2619         /* Try to be a bit smarter and adjust defaults depending
2620          * on available memory.
2621          */
2622         if (order > 4) {
2623                 sysctl_local_port_range[0] = 32768;
2624                 sysctl_local_port_range[1] = 61000;
2625                 sysctl_tcp_max_tw_buckets = 180000;
2626                 sysctl_tcp_max_orphans = 4096<<(order-4);
2627                 sysctl_max_syn_backlog = 1024;
2628         } else if (order < 3) {
2629                 sysctl_local_port_range[0] = 1024*(3-order);
2630                 sysctl_tcp_max_tw_buckets >>= (3-order);
2631                 sysctl_tcp_max_orphans >>= (3-order);
2632                 sysctl_max_syn_backlog = 128;
2633         }
2634         tcp_port_rover = sysctl_local_port_range[0] - 1;
2635
2636         sysctl_tcp_mem[0] = 768<<order;
2637         sysctl_tcp_mem[1] = 1024<<order;
2638         sysctl_tcp_mem[2] = 1536<<order;
2639
2640         if (order < 3) {
2641                 sysctl_tcp_wmem[2] = 64*1024;
2642                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2643                 sysctl_tcp_rmem[1] = 43689;
2644                 sysctl_tcp_rmem[2] = 2*43689;
2645         }
2646
2647         printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2648                tcp_ehash_size<<1, tcp_bhash_size);
2649
2650         (void) tcp_mib_init();
2651         tcpdiag_init();
2652 }
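/* Worked example of the sizing logic above, assuming 4 KB pages
 * (PAGE_SHIFT == 12), 128 MB of RAM (num_physpages == 32768) and hash table
 * allocations that succeed on the first attempt: the machine is below the
 * 128*1024-page threshold, so goal = 32768 >> 11 = 16 pages and the order
 * loop settles on order = 4.  That order is neither > 4 nor < 3, so the port
 * range, orphan and time-wait limits keep their defaults, and sysctl_tcp_mem
 * becomes { 768 << 4, 1024 << 4, 1536 << 4 } = { 12288, 16384, 24576 } pages.
 */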