[NET]: Uninline the sk_stream_alloc_pskb
[powerpc.git] net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed where wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up with retrying without
154  *                                      an answer (not even a 'no space' one).
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/kernel.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/fs.h>
257 #include <linux/skbuff.h>
258 #include <linux/splice.h>
259 #include <linux/net.h>
260 #include <linux/socket.h>
261 #include <linux/random.h>
262 #include <linux/bootmem.h>
263 #include <linux/cache.h>
264 #include <linux/err.h>
265 #include <linux/crypto.h>
266
267 #include <net/icmp.h>
268 #include <net/tcp.h>
269 #include <net/xfrm.h>
270 #include <net/ip.h>
271 #include <net/netdma.h>
272 #include <net/sock.h>
273
274 #include <asm/uaccess.h>
275 #include <asm/ioctls.h>
276
277 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
278
279 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
280
281 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
282
283 EXPORT_SYMBOL_GPL(tcp_orphan_count);
284
285 int sysctl_tcp_mem[3] __read_mostly;
286 int sysctl_tcp_wmem[3] __read_mostly;
287 int sysctl_tcp_rmem[3] __read_mostly;
288
289 EXPORT_SYMBOL(sysctl_tcp_mem);
290 EXPORT_SYMBOL(sysctl_tcp_rmem);
291 EXPORT_SYMBOL(sysctl_tcp_wmem);
292
293 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
294 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
295
296 EXPORT_SYMBOL(tcp_memory_allocated);
297 EXPORT_SYMBOL(tcp_sockets_allocated);
298
299 /*
300  * TCP splice context
301  */
302 struct tcp_splice_state {
303         struct pipe_inode_info *pipe;
304         size_t len;
305         unsigned int flags;
306 };
307
308 /*
309  * Pressure flag: try to collapse.
310  * Technical note: it is used by multiple contexts non-atomically.
311  * All of the sk_stream_mem_schedule() logic is of this nature: accounting
312  * is strict, actions are advisory and have some latency.
313  */
314 int tcp_memory_pressure __read_mostly;
315
316 EXPORT_SYMBOL(tcp_memory_pressure);
317
318 void tcp_enter_memory_pressure(void)
319 {
320         if (!tcp_memory_pressure) {
321                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
322                 tcp_memory_pressure = 1;
323         }
324 }
325
326 EXPORT_SYMBOL(tcp_enter_memory_pressure);
327
328 /*
329  *      Wait for a TCP event.
330  *
331  *      Note that we don't need to lock the socket, as the upper poll layers
332  *      take care of normal races (between the test and the event) and we don't
333  *      go look at any of the socket buffers directly.
334  */
335 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
336 {
337         unsigned int mask;
338         struct sock *sk = sock->sk;
339         struct tcp_sock *tp = tcp_sk(sk);
340
341         poll_wait(file, sk->sk_sleep, wait);
342         if (sk->sk_state == TCP_LISTEN)
343                 return inet_csk_listen_poll(sk);
344
345         /* Socket is not locked. We are protected from async events
346            by poll logic and correct handling of state changes
347            made by other threads is impossible in any case.
348          */
349
350         mask = 0;
351         if (sk->sk_err)
352                 mask = POLLERR;
353
354         /*
355          * POLLHUP is certainly not done right. But poll() doesn't
356          * have a notion of HUP in just one direction, and for a
357          * socket the read side is more interesting.
358          *
359          * Some poll() documentation says that POLLHUP is incompatible
360          * with the POLLOUT/POLLWR flags, so somebody should check this
361          * all. But careful, it tends to be safer to return too many
362          * bits than too few, and you can easily break real applications
363          * if you don't tell them that something has hung up!
364          *
365          * Check-me.
366          *
367          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
368          * our fs/select.c). It means that after we receive EOF,
369          * poll always returns immediately, making poll() on write()
370          * impossible in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
371          * if and only if shutdown has been made in both directions.
372          * Actually, it is interesting to look at how Solaris and DUX
373          * solve this dilemma. I would prefer it if POLLHUP were maskable;
374          * then we could set it on SND_SHUTDOWN. BTW the examples given
375          * in Stevens' books assume exactly this behaviour, which explains
376          * why POLLHUP is incompatible with POLLOUT.    --ANK
377          *
378          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
379          * blocking on fresh not-connected or disconnected socket. --ANK
380          */
381         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
382                 mask |= POLLHUP;
383         if (sk->sk_shutdown & RCV_SHUTDOWN)
384                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
385
386         /* Connected? */
387         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
388                 /* Potential race condition. If the read of tp below is
389                  * reordered above the read of sk->sk_state, we can be
390                  * spuriously woken in SYN_* states. */
391                 if ((tp->rcv_nxt != tp->copied_seq) &&
392                     (tp->urg_seq != tp->copied_seq ||
393                      tp->rcv_nxt != tp->copied_seq + 1 ||
394                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
395                         mask |= POLLIN | POLLRDNORM;
396
397                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
398                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
399                                 mask |= POLLOUT | POLLWRNORM;
400                         } else {  /* send SIGIO later */
401                                 set_bit(SOCK_ASYNC_NOSPACE,
402                                         &sk->sk_socket->flags);
403                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
404
405                                 /* Race breaker. If space is freed after
406                                  * wspace test but before the flags are set,
407                                  * IO signal will be lost.
408                                  */
409                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
410                                         mask |= POLLOUT | POLLWRNORM;
411                         }
412                 }
413
414                 if (tp->urg_data & TCP_URG_VALID)
415                         mask |= POLLPRI;
416         }
417         return mask;
418 }
419
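/* ioctl() handler for TCP sockets.  SIOCINQ reports how much data can be
 * read (up to the urgent mark, and excluding a queued FIN), SIOCATMARK
 * tests whether the next read starts at the urgent mark, and SIOCOUTQ
 * reports how many bytes have been written but not yet acknowledged.
 */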
420 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
421 {
422         struct tcp_sock *tp = tcp_sk(sk);
423         int answ;
424
425         switch (cmd) {
426         case SIOCINQ:
427                 if (sk->sk_state == TCP_LISTEN)
428                         return -EINVAL;
429
430                 lock_sock(sk);
431                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
432                         answ = 0;
433                 else if (sock_flag(sk, SOCK_URGINLINE) ||
434                          !tp->urg_data ||
435                          before(tp->urg_seq, tp->copied_seq) ||
436                          !before(tp->urg_seq, tp->rcv_nxt)) {
437                         answ = tp->rcv_nxt - tp->copied_seq;
438
439                         /* Subtract 1, if FIN is in queue. */
440                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
441                                 answ -= tcp_hdr((struct sk_buff *)
442                                                 sk->sk_receive_queue.prev)->fin;
443                 } else
444                         answ = tp->urg_seq - tp->copied_seq;
445                 release_sock(sk);
446                 break;
447         case SIOCATMARK:
448                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
449                 break;
450         case SIOCOUTQ:
451                 if (sk->sk_state == TCP_LISTEN)
452                         return -EINVAL;
453
454                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
455                         answ = 0;
456                 else
457                         answ = tp->write_seq - tp->snd_una;
458                 break;
459         default:
460                 return -ENOIOCTLCMD;
461         }
462
463         return put_user(answ, (int __user *)arg);
464 }
465
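/* Set PSH on this skb and remember how far we have pushed, so that
 * forced_push() can tell when the next push is due.
 */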
466 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
467 {
468         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
469         tp->pushed_seq = tp->write_seq;
470 }
471
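/* True once more than half of the largest window the peer has ever
 * advertised has been written since the last PSH, meaning the pending
 * data should be pushed out now.
 */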
472 static inline int forced_push(struct tcp_sock *tp)
473 {
474         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
475 }
476
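/* Queue a freshly allocated skb at the tail of the write queue: initialise
 * its control block at the current write_seq, charge its memory to the
 * socket and drop any one-shot TCP_NAGLE_PUSH override.
 */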
477 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
478 {
479         struct tcp_sock *tp = tcp_sk(sk);
480         struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
481
482         skb->csum    = 0;
483         tcb->seq     = tcb->end_seq = tp->write_seq;
484         tcb->flags   = TCPCB_FLAG_ACK;
485         tcb->sacked  = 0;
486         skb_header_release(skb);
487         tcp_add_write_queue_tail(sk, skb);
488         sk_charge_skb(sk, skb);
489         if (tp->nonagle & TCP_NAGLE_PUSH)
490                 tp->nonagle &= ~TCP_NAGLE_PUSH;
491 }
492
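/* For MSG_OOB sends: enter urgent mode, point snd_up at the current
 * write_seq and tag the skb as carrying urgent data.
 */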
493 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
494                                 struct sk_buff *skb)
495 {
496         if (flags & MSG_OOB) {
497                 tp->urg_mode = 1;
498                 tp->snd_up = tp->write_seq;
499                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
500         }
501 }
502
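/* Transmit whatever is queued but unsent.  With MSG_MORE the socket stays
 * corked and PSH is only set once forced_push() says enough data has built
 * up; otherwise the tail skb is marked PSH before the frames are pushed.
 */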
503 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
504                             int nonagle)
505 {
506         struct tcp_sock *tp = tcp_sk(sk);
507
508         if (tcp_send_head(sk)) {
509                 struct sk_buff *skb = tcp_write_queue_tail(sk);
510                 if (!(flags & MSG_MORE) || forced_push(tp))
511                         tcp_mark_push(tp, skb);
512                 tcp_mark_urg(tp, flags, skb);
513                 __tcp_push_pending_frames(sk, mss_now,
514                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
515         }
516 }
517
518 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
519                                 unsigned int offset, size_t len)
520 {
521         struct tcp_splice_state *tss = rd_desc->arg.data;
522
523         return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
524 }
525
526 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
527 {
528         /* Store TCP splice context information in read_descriptor_t. */
529         read_descriptor_t rd_desc = {
530                 .arg.data = tss,
531         };
532
533         return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
534 }
535
536 /**
537  *  tcp_splice_read - splice data from TCP socket to a pipe
538  * @sock:       socket to splice from
539  * @ppos:       position (not valid)
540  * @pipe:       pipe to splice to
541  * @len:        number of bytes to splice
542  * @flags:      splice modifier flags
543  *
544  * Description:
545  *    Will read pages from given socket and fill them into a pipe.
546  *
547  **/
548 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
549                         struct pipe_inode_info *pipe, size_t len,
550                         unsigned int flags)
551 {
552         struct sock *sk = sock->sk;
553         struct tcp_splice_state tss = {
554                 .pipe = pipe,
555                 .len = len,
556                 .flags = flags,
557         };
558         long timeo;
559         ssize_t spliced;
560         int ret;
561
562         /*
563          * We can't seek on a socket input
564          */
565         if (unlikely(*ppos))
566                 return -ESPIPE;
567
568         ret = spliced = 0;
569
570         lock_sock(sk);
571
572         timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
573         while (tss.len) {
574                 ret = __tcp_splice_read(sk, &tss);
575                 if (ret < 0)
576                         break;
577                 else if (!ret) {
578                         if (spliced)
579                                 break;
580                         if (flags & SPLICE_F_NONBLOCK) {
581                                 ret = -EAGAIN;
582                                 break;
583                         }
584                         if (sock_flag(sk, SOCK_DONE))
585                                 break;
586                         if (sk->sk_err) {
587                                 ret = sock_error(sk);
588                                 break;
589                         }
590                         if (sk->sk_shutdown & RCV_SHUTDOWN)
591                                 break;
592                         if (sk->sk_state == TCP_CLOSE) {
593                                 /*
594                                  * This occurs when the user tries to read
595                                  * from a never-connected socket.
596                                  */
597                                 if (!sock_flag(sk, SOCK_DONE))
598                                         ret = -ENOTCONN;
599                                 break;
600                         }
601                         if (!timeo) {
602                                 ret = -EAGAIN;
603                                 break;
604                         }
605                         sk_wait_data(sk, &timeo);
606                         if (signal_pending(current)) {
607                                 ret = sock_intr_errno(timeo);
608                                 break;
609                         }
610                         continue;
611                 }
612                 tss.len -= ret;
613                 spliced += ret;
614
615                 release_sock(sk);
616                 lock_sock(sk);
617
618                 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
619                     (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
620                     signal_pending(current))
621                         break;
622         }
623
624         release_sock(sk);
625
626         if (spliced)
627                 return spliced;
628
629         return ret;
630 }
631
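/* Allocate a transmit skb for a stream socket.  The payload size is rounded
 * up to a 32-bit boundary, the protocol's worst-case header space is added,
 * and the tailroom is trimmed so the caller sees exactly @size bytes.  If
 * the skb cannot be charged to the socket's send budget it is freed; if the
 * allocation itself fails, memory pressure is signalled and the send buffer
 * is moderated.  Returns the skb or NULL.
 */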
632 struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
633                 int size, int mem, gfp_t gfp)
634 {
635         struct sk_buff *skb;
636
637         /* The TCP header must be at least 32-bit aligned.  */
638         size = ALIGN(size, 4);
639
640         skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
641         if (skb) {
642                 skb->truesize += mem;
643                 if (sk_stream_wmem_schedule(sk, skb->truesize)) {
644                         /*
645                          * Make sure that we have exactly size bytes
646                          * available to the caller, no more, no less.
647                          */
648                         skb_reserve(skb, skb_tailroom(skb) - size);
649                         return skb;
650                 }
651                 __kfree_skb(skb);
652         } else {
653                 sk->sk_prot->enter_memory_pressure();
654                 sk_stream_moderate_sndbuf(sk);
655         }
656         return NULL;
657 }
658
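/* Zero-copy transmit path behind tcp_sendpage(): attach the caller's pages
 * to write-queue skbs as page fragments (coalescing with the previous
 * fragment when possible), wait for send-buffer space when necessary, and
 * push the resulting segments according to Nagle, MSG_MORE and forced_push().
 */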
659 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
660                          size_t psize, int flags)
661 {
662         struct tcp_sock *tp = tcp_sk(sk);
663         int mss_now, size_goal;
664         int err;
665         ssize_t copied;
666         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
667
668         /* Wait for a connection to finish. */
669         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
670                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
671                         goto out_err;
672
673         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
674
675         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
676         size_goal = tp->xmit_size_goal;
677         copied = 0;
678
679         err = -EPIPE;
680         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
681                 goto do_error;
682
683         while (psize > 0) {
684                 struct sk_buff *skb = tcp_write_queue_tail(sk);
685                 struct page *page = pages[poffset / PAGE_SIZE];
686                 int copy, i, can_coalesce;
687                 int offset = poffset % PAGE_SIZE;
688                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
689
690                 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
691 new_segment:
692                         if (!sk_stream_memory_free(sk))
693                                 goto wait_for_sndbuf;
694
695                         skb = sk_stream_alloc_pskb(sk, 0, 0,
696                                                    sk->sk_allocation);
697                         if (!skb)
698                                 goto wait_for_memory;
699
700                         skb_entail(sk, skb);
701                         copy = size_goal;
702                 }
703
704                 if (copy > size)
705                         copy = size;
706
707                 i = skb_shinfo(skb)->nr_frags;
708                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
709                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
710                         tcp_mark_push(tp, skb);
711                         goto new_segment;
712                 }
713                 if (!sk_stream_wmem_schedule(sk, copy))
714                         goto wait_for_memory;
715
716                 if (can_coalesce) {
717                         skb_shinfo(skb)->frags[i - 1].size += copy;
718                 } else {
719                         get_page(page);
720                         skb_fill_page_desc(skb, i, page, offset, copy);
721                 }
722
723                 skb->len += copy;
724                 skb->data_len += copy;
725                 skb->truesize += copy;
726                 sk->sk_wmem_queued += copy;
727                 sk->sk_forward_alloc -= copy;
728                 skb->ip_summed = CHECKSUM_PARTIAL;
729                 tp->write_seq += copy;
730                 TCP_SKB_CB(skb)->end_seq += copy;
731                 skb_shinfo(skb)->gso_segs = 0;
732
733                 if (!copied)
734                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
735
736                 copied += copy;
737                 poffset += copy;
738                 if (!(psize -= copy))
739                         goto out;
740
741                 if (skb->len < mss_now || (flags & MSG_OOB))
742                         continue;
743
744                 if (forced_push(tp)) {
745                         tcp_mark_push(tp, skb);
746                         __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
747                 } else if (skb == tcp_send_head(sk))
748                         tcp_push_one(sk, mss_now);
749                 continue;
750
751 wait_for_sndbuf:
752                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
753 wait_for_memory:
754                 if (copied)
755                         tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
756
757                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
758                         goto do_error;
759
760                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
761                 size_goal = tp->xmit_size_goal;
762         }
763
764 out:
765         if (copied)
766                 tcp_push(sk, flags, mss_now, tp->nonagle);
767         return copied;
768
769 do_error:
770         if (copied)
771                 goto out;
772 out_err:
773         return sk_stream_error(sk, flags, err);
774 }
775
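/* sendpage() entry point for TCP.  Falls back to sock_no_sendpage(), which
 * copies the data, when the route cannot do scatter-gather with hardware
 * checksumming; otherwise calls do_tcp_sendpages() under the socket lock.
 */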
776 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
777                      size_t size, int flags)
778 {
779         ssize_t res;
780         struct sock *sk = sock->sk;
781
782         if (!(sk->sk_route_caps & NETIF_F_SG) ||
783             !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
784                 return sock_no_sendpage(sock, page, offset, size, flags);
785
786         lock_sock(sk);
787         TCP_CHECK_TIMER(sk);
788         res = do_tcp_sendpages(sk, &page, offset, size, flags);
789         TCP_CHECK_TIMER(sk);
790         release_sock(sk);
791         return res;
792 }
793
794 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
795 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
796
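/* Pick how much linear (non-paged) room to request for a new transmit skb
 * in tcp_sendmsg(): none for GSO-capable sockets (data goes straight into
 * page fragments), a full cached MSS for non-SG routes, and for plain SG
 * routes a head clamped so that it fits in a single page allocation.
 */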
797 static inline int select_size(struct sock *sk)
798 {
799         struct tcp_sock *tp = tcp_sk(sk);
800         int tmp = tp->mss_cache;
801
802         if (sk->sk_route_caps & NETIF_F_SG) {
803                 if (sk_can_gso(sk))
804                         tmp = 0;
805                 else {
806                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
807
808                         if (tmp >= pgbreak &&
809                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
810                                 tmp = pgbreak;
811                 }
812         }
813
814         return tmp;
815 }
816
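/* Main sendmsg() path for TCP.  User data is copied into the tail skb of
 * the write queue, into its linear area when there is tailroom or into the
 * socket's cached page otherwise, with new segments of up to size_goal
 * bytes allocated as needed.  Frames are pushed according to Nagle,
 * MSG_MORE and forced_push(), sleeping for send-buffer space when allowed.
 */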
817 int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
818                 size_t size)
819 {
820         struct sock *sk = sock->sk;
821         struct iovec *iov;
822         struct tcp_sock *tp = tcp_sk(sk);
823         struct sk_buff *skb;
824         int iovlen, flags;
825         int mss_now, size_goal;
826         int err, copied;
827         long timeo;
828
829         lock_sock(sk);
830         TCP_CHECK_TIMER(sk);
831
832         flags = msg->msg_flags;
833         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
834
835         /* Wait for a connection to finish. */
836         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
837                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
838                         goto out_err;
839
840         /* This should be in poll */
841         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
842
843         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
844         size_goal = tp->xmit_size_goal;
845
846         /* Ok commence sending. */
847         iovlen = msg->msg_iovlen;
848         iov = msg->msg_iov;
849         copied = 0;
850
851         err = -EPIPE;
852         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
853                 goto do_error;
854
855         while (--iovlen >= 0) {
856                 int seglen = iov->iov_len;
857                 unsigned char __user *from = iov->iov_base;
858
859                 iov++;
860
861                 while (seglen > 0) {
862                         int copy;
863
864                         skb = tcp_write_queue_tail(sk);
865
866                         if (!tcp_send_head(sk) ||
867                             (copy = size_goal - skb->len) <= 0) {
868
869 new_segment:
870                                 /* Allocate a new segment. If the interface is SG,
871                                  * allocate an skb that fits into a single page.
872                                  */
873                                 if (!sk_stream_memory_free(sk))
874                                         goto wait_for_sndbuf;
875
876                                 skb = sk_stream_alloc_pskb(sk, select_size(sk),
877                                                            0, sk->sk_allocation);
878                                 if (!skb)
879                                         goto wait_for_memory;
880
881                                 /*
882                                  * Check whether we can use HW checksum.
883                                  */
884                                 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
885                                         skb->ip_summed = CHECKSUM_PARTIAL;
886
887                                 skb_entail(sk, skb);
888                                 copy = size_goal;
889                         }
890
891                         /* Try to append data to the end of skb. */
892                         if (copy > seglen)
893                                 copy = seglen;
894
895                         /* Where to copy to? */
896                         if (skb_tailroom(skb) > 0) {
897                                 /* We have some space in skb head. Superb! */
898                                 if (copy > skb_tailroom(skb))
899                                         copy = skb_tailroom(skb);
900                                 if ((err = skb_add_data(skb, from, copy)) != 0)
901                                         goto do_fault;
902                         } else {
903                                 int merge = 0;
904                                 int i = skb_shinfo(skb)->nr_frags;
905                                 struct page *page = TCP_PAGE(sk);
906                                 int off = TCP_OFF(sk);
907
908                                 if (skb_can_coalesce(skb, i, page, off) &&
909                                     off != PAGE_SIZE) {
910                                         /* We can extend the last page
911                                          * fragment. */
912                                         merge = 1;
913                                 } else if (i == MAX_SKB_FRAGS ||
914                                            (!i &&
915                                            !(sk->sk_route_caps & NETIF_F_SG))) {
916                                         /* We need to add a new fragment but
917                                          * cannot, because the interface is
918                                          * non-SG or all the page slots are
919                                          * busy. */
920                                         tcp_mark_push(tp, skb);
921                                         goto new_segment;
922                                 } else if (page) {
923                                         if (off == PAGE_SIZE) {
924                                                 put_page(page);
925                                                 TCP_PAGE(sk) = page = NULL;
926                                                 off = 0;
927                                         }
928                                 } else
929                                         off = 0;
930
931                                 if (copy > PAGE_SIZE - off)
932                                         copy = PAGE_SIZE - off;
933
934                                 if (!sk_stream_wmem_schedule(sk, copy))
935                                         goto wait_for_memory;
936
937                                 if (!page) {
938                                         /* Allocate new cache page. */
939                                         if (!(page = sk_stream_alloc_page(sk)))
940                                                 goto wait_for_memory;
941                                 }
942
943                                 /* Time to copy data. We are close to
944                                  * the end! */
945                                 err = skb_copy_to_page(sk, from, skb, page,
946                                                        off, copy);
947                                 if (err) {
948                                         /* If this page was new, give it to the
949                                          * socket so it does not get leaked.
950                                          */
951                                         if (!TCP_PAGE(sk)) {
952                                                 TCP_PAGE(sk) = page;
953                                                 TCP_OFF(sk) = 0;
954                                         }
955                                         goto do_error;
956                                 }
957
958                                 /* Update the skb. */
959                                 if (merge) {
960                                         skb_shinfo(skb)->frags[i - 1].size +=
961                                                                         copy;
962                                 } else {
963                                         skb_fill_page_desc(skb, i, page, off, copy);
964                                         if (TCP_PAGE(sk)) {
965                                                 get_page(page);
966                                         } else if (off + copy < PAGE_SIZE) {
967                                                 get_page(page);
968                                                 TCP_PAGE(sk) = page;
969                                         }
970                                 }
971
972                                 TCP_OFF(sk) = off + copy;
973                         }
974
975                         if (!copied)
976                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
977
978                         tp->write_seq += copy;
979                         TCP_SKB_CB(skb)->end_seq += copy;
980                         skb_shinfo(skb)->gso_segs = 0;
981
982                         from += copy;
983                         copied += copy;
984                         if ((seglen -= copy) == 0 && iovlen == 0)
985                                 goto out;
986
987                         if (skb->len < mss_now || (flags & MSG_OOB))
988                                 continue;
989
990                         if (forced_push(tp)) {
991                                 tcp_mark_push(tp, skb);
992                                 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
993                         } else if (skb == tcp_send_head(sk))
994                                 tcp_push_one(sk, mss_now);
995                         continue;
996
997 wait_for_sndbuf:
998                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
999 wait_for_memory:
1000                         if (copied)
1001                                 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1002
1003                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1004                                 goto do_error;
1005
1006                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1007                         size_goal = tp->xmit_size_goal;
1008                 }
1009         }
1010
1011 out:
1012         if (copied)
1013                 tcp_push(sk, flags, mss_now, tp->nonagle);
1014         TCP_CHECK_TIMER(sk);
1015         release_sock(sk);
1016         return copied;
1017
1018 do_fault:
1019         if (!skb->len) {
1020                 tcp_unlink_write_queue(skb, sk);
1021                 /* It is the one place in all of TCP, except connection
1022                  * reset, where we can be unlinking the send_head.
1023                  */
1024                 tcp_check_send_head(sk, skb);
1025                 sk_stream_free_skb(sk, skb);
1026         }
1027
1028 do_error:
1029         if (copied)
1030                 goto out;
1031 out_err:
1032         err = sk_stream_error(sk, flags, err);
1033         TCP_CHECK_TIMER(sk);
1034         release_sock(sk);
1035         return err;
1036 }
1037
1038 /*
1039  *      Handle reading urgent data. BSD has very simple semantics for
1040  *      this, no blocking and very strange errors 8)
1041  */
1042
1043 static int tcp_recv_urg(struct sock *sk, long timeo,
1044                         struct msghdr *msg, int len, int flags,
1045                         int *addr_len)
1046 {
1047         struct tcp_sock *tp = tcp_sk(sk);
1048
1049         /* No URG data to read. */
1050         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1051             tp->urg_data == TCP_URG_READ)
1052                 return -EINVAL; /* Yes this is right ! */
1053
1054         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1055                 return -ENOTCONN;
1056
1057         if (tp->urg_data & TCP_URG_VALID) {
1058                 int err = 0;
1059                 char c = tp->urg_data;
1060
1061                 if (!(flags & MSG_PEEK))
1062                         tp->urg_data = TCP_URG_READ;
1063
1064                 /* Read urgent data. */
1065                 msg->msg_flags |= MSG_OOB;
1066
1067                 if (len > 0) {
1068                         if (!(flags & MSG_TRUNC))
1069                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1070                         len = 1;
1071                 } else
1072                         msg->msg_flags |= MSG_TRUNC;
1073
1074                 return err ? -EFAULT : len;
1075         }
1076
1077         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1078                 return 0;
1079
1080         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1081          * the available implementations agree in this case:
1082          * this call should never block, independent of the
1083          * blocking state of the socket.
1084          * Mike <pall@rz.uni-karlsruhe.de>
1085          */
1086         return -EAGAIN;
1087 }
1088
1089 /* Clean up the receive buffer for full frames taken by the user,
1090  * then send an ACK if necessary.  COPIED is the number of bytes
1091  * tcp_recvmsg has given to the user so far; it speeds up the
1092  * calculation of whether or not we must ACK for the sake of
1093  * a window update.
1094  */
1095 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1096 {
1097         struct tcp_sock *tp = tcp_sk(sk);
1098         int time_to_ack = 0;
1099
1100 #if TCP_DEBUG
1101         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1102
1103         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1104 #endif
1105
1106         if (inet_csk_ack_scheduled(sk)) {
1107                 const struct inet_connection_sock *icsk = inet_csk(sk);
1108                    /* Delayed ACKs frequently hit locked sockets during bulk
1109                     * receive. */
1110                 if (icsk->icsk_ack.blocked ||
1111                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1112                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1113                     /*
1114                      * Send an ACK if this read emptied the receive buffer,
1115                      * the connection is not bidirectional, the user drained
1116                      * the receive buffer and there was a small segment
1117                      * in the queue.
1118                      */
1119                     (copied > 0 &&
1120                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1121                       ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1122                        !icsk->icsk_ack.pingpong)) &&
1123                       !atomic_read(&sk->sk_rmem_alloc)))
1124                         time_to_ack = 1;
1125         }
1126
1127         /* We send an ACK if we can now advertise a non-zero window
1128          * which has been raised "significantly".
1129          *
1130  * Even if the window was raised up to infinity, do not send a window-open ACK
1131  * in states where we will not receive any more data. It is useless.
1132          */
1133         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1134                 __u32 rcv_window_now = tcp_receive_window(tp);
1135
1136                 /* Optimize, __tcp_select_window() is not cheap. */
1137                 if (2*rcv_window_now <= tp->window_clamp) {
1138                         __u32 new_window = __tcp_select_window(sk);
1139
1140                         /* Send ACK now, if this read freed lots of space
1141                          * in our buffer. new_window is the window we could
1142                          * advertise now; do so only if it is not less than the current one.
1143                          * "Lots" means "at least twice" here.
1144                          */
1145                         if (new_window && new_window >= 2 * rcv_window_now)
1146                                 time_to_ack = 1;
1147                 }
1148         }
1149         if (time_to_ack)
1150                 tcp_send_ack(sk);
1151 }
1152
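/* Run the skbs parked on the prequeue through the normal receive path
 * (sk_backlog_rcv) with BHs disabled, then reset the prequeue memory
 * counter.
 */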
1153 static void tcp_prequeue_process(struct sock *sk)
1154 {
1155         struct sk_buff *skb;
1156         struct tcp_sock *tp = tcp_sk(sk);
1157
1158         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1159
1160         /* The RX process wants to run with BHs disabled, though it is not
1161          * strictly necessary. */
1162         local_bh_disable();
1163         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1164                 sk->sk_backlog_rcv(sk, skb);
1165         local_bh_enable();
1166
1167         /* Clear memory counter. */
1168         tp->ucopy.memory = 0;
1169 }
1170
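/* Find the skb in the receive queue that covers sequence number @seq and
 * store the offset of @seq within it in *off.  A SYN occupying a sequence
 * number is skipped over; an skb carrying a FIN is returned even when @seq
 * points past its data.  Returns NULL if @seq is beyond everything queued.
 */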
1171 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1172 {
1173         struct sk_buff *skb;
1174         u32 offset;
1175
1176         skb_queue_walk(&sk->sk_receive_queue, skb) {
1177                 offset = seq - TCP_SKB_CB(skb)->seq;
1178                 if (tcp_hdr(skb)->syn)
1179                         offset--;
1180                 if (offset < skb->len || tcp_hdr(skb)->fin) {
1181                         *off = offset;
1182                         return skb;
1183                 }
1184         }
1185         return NULL;
1186 }
1187
1188 /*
1189  * This routine provides an alternative to tcp_recvmsg() for routines
1190  * that would like to handle copying from skbuffs directly in 'sendfile'
1191  * fashion.
1192  * Note:
1193  *      - It is assumed that the socket was locked by the caller.
1194  *      - The routine does not block.
1195  *      - At present, there is no support for reading OOB data
1196  *        or for 'peeking' the socket using this routine
1197  *        (although both would be easy to implement).
1198  */
1199 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1200                   sk_read_actor_t recv_actor)
1201 {
1202         struct sk_buff *skb;
1203         struct tcp_sock *tp = tcp_sk(sk);
1204         u32 seq = tp->copied_seq;
1205         u32 offset;
1206         int copied = 0;
1207
1208         if (sk->sk_state == TCP_LISTEN)
1209                 return -ENOTCONN;
1210         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1211                 if (offset < skb->len) {
1212                         int used;       /* recv_actor() may return a negative error */
1213                         size_t len;
1214                         len = skb->len - offset;
1215                         /* Stop reading if we hit a patch of urgent data */
1216                         if (tp->urg_data) {
1217                                 u32 urg_offset = tp->urg_seq - seq;
1218                                 if (urg_offset < len)
1219                                         len = urg_offset;
1220                                 if (!len)
1221                                         break;
1222                         }
1223                         used = recv_actor(desc, skb, offset, len);
1224                         if (used < 0) {
1225                                 if (!copied)
1226                                         copied = used;
1227                                 break;
1228                         } else if (used <= len) {
1229                                 seq += used;
1230                                 copied += used;
1231                                 offset += used;
1232                         }
1233                         if (offset != skb->len)
1234                                 break;
1235                 }
1236                 if (tcp_hdr(skb)->fin) {
1237                         sk_eat_skb(sk, skb, 0);
1238                         ++seq;
1239                         break;
1240                 }
1241                 sk_eat_skb(sk, skb, 0);
1242                 if (!desc->count)
1243                         break;
1244         }
1245         tp->copied_seq = seq;
1246
1247         tcp_rcv_space_adjust(sk);
1248
1249         /* Clean up data we have read: This will do ACK frames. */
1250         if (copied > 0)
1251                 tcp_cleanup_rbuf(sk, copied);
1252         return copied;
1253 }
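
/*
 * Illustrative sketch (not part of the build): a minimal sk_read_actor_t
 * for tcp_read_sock() above.  The actor and its caller are hypothetical;
 * only the actor signature and the read_descriptor_t fields (count,
 * written) are assumed from the networking headers.
 *
 *	static int count_bytes_actor(read_descriptor_t *desc,
 *				     struct sk_buff *skb,
 *				     unsigned int offset, size_t len)
 *	{
 *		size_t want = min_t(size_t, len, desc->count);
 *
 *		desc->count -= want;
 *		desc->written += want;
 *		return want;
 *	}
 *
 * Returning less than len (or 0 once desc->count is exhausted) makes
 * tcp_read_sock() stop walking the receive queue.  With the socket
 * already locked, a caller would then do roughly:
 *
 *	read_descriptor_t rd = { .count = nbytes };
 *	int copied = tcp_read_sock(sk, &rd, count_bytes_actor);
 */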
1254
1255 /*
1256  *      This routine copies from a sock struct into the user buffer.
1257  *
1258  *      Technical note: in 2.3 we work on _locked_ socket, so that
1259  *      tricks with *seq access order and skb->users are not required.
1260  *      Probably, code can be easily improved even more.
1261  */
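/*
 *	A rough userspace view of the "target" computed below from
 *	sock_rcvlowat() (which returns len when MSG_WAITALL is set and
 *	min(SO_RCVLOWAT, len) otherwise); fd and sizes are hypothetical:
 *
 *		char buf[4096];
 *		int lowat = 1024;
 *
 *		setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *		n = recv(fd, buf, sizeof(buf), 0);
 *
 *	blocks until at least 1024 bytes are queued (or EOF/error/signal),
 *	while
 *
 *		n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *
 *	keeps waiting until the whole buffer can be filled.
 */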
1262
1263 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1264                 size_t len, int nonblock, int flags, int *addr_len)
1265 {
1266         struct tcp_sock *tp = tcp_sk(sk);
1267         int copied = 0;
1268         u32 peek_seq;
1269         u32 *seq;
1270         unsigned long used;
1271         int err;
1272         int target;             /* Read at least this many bytes */
1273         long timeo;
1274         struct task_struct *user_recv = NULL;
1275         int copied_early = 0;
1276         struct sk_buff *skb;
1277
1278         lock_sock(sk);
1279
1280         TCP_CHECK_TIMER(sk);
1281
1282         err = -ENOTCONN;
1283         if (sk->sk_state == TCP_LISTEN)
1284                 goto out;
1285
1286         timeo = sock_rcvtimeo(sk, nonblock);
1287
1288         /* Urgent data needs to be handled specially. */
1289         if (flags & MSG_OOB)
1290                 goto recv_urg;
1291
1292         seq = &tp->copied_seq;
1293         if (flags & MSG_PEEK) {
1294                 peek_seq = tp->copied_seq;
1295                 seq = &peek_seq;
1296         }
1297
1298         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1299
1300 #ifdef CONFIG_NET_DMA
1301         tp->ucopy.dma_chan = NULL;
1302         preempt_disable();
1303         skb = skb_peek_tail(&sk->sk_receive_queue);
1304         {
1305                 int available = 0;
1306
1307                 if (skb)
1308                         available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1309                 if ((available < target) &&
1310                     (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1311                     !sysctl_tcp_low_latency &&
1312                     __get_cpu_var(softnet_data).net_dma) {
1313                         preempt_enable_no_resched();
1314                         tp->ucopy.pinned_list =
1315                                         dma_pin_iovec_pages(msg->msg_iov, len);
1316                 } else {
1317                         preempt_enable_no_resched();
1318                 }
1319         }
1320 #endif
1321
1322         do {
1323                 u32 offset;
1324
1325                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1326                 if (tp->urg_data && tp->urg_seq == *seq) {
1327                         if (copied)
1328                                 break;
1329                         if (signal_pending(current)) {
1330                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1331                                 break;
1332                         }
1333                 }
1334
1335                 /* Next get a buffer. */
1336
1337                 skb = skb_peek(&sk->sk_receive_queue);
1338                 do {
1339                         if (!skb)
1340                                 break;
1341
1342                         /* Now that we have two receive queues this
1343                          * shouldn't happen.
1344                          */
1345                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1346                                 printk(KERN_INFO "recvmsg bug: copied %X "
1347                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1348                                 break;
1349                         }
1350                         offset = *seq - TCP_SKB_CB(skb)->seq;
1351                         if (tcp_hdr(skb)->syn)
1352                                 offset--;
1353                         if (offset < skb->len)
1354                                 goto found_ok_skb;
1355                         if (tcp_hdr(skb)->fin)
1356                                 goto found_fin_ok;
1357                         BUG_TRAP(flags & MSG_PEEK);
1358                         skb = skb->next;
1359                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1360
1361                 /* Well, if we have backlog, try to process it now. */
1362
1363                 if (copied >= target && !sk->sk_backlog.tail)
1364                         break;
1365
1366                 if (copied) {
1367                         if (sk->sk_err ||
1368                             sk->sk_state == TCP_CLOSE ||
1369                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1370                             !timeo ||
1371                             signal_pending(current) ||
1372                             (flags & MSG_PEEK))
1373                                 break;
1374                 } else {
1375                         if (sock_flag(sk, SOCK_DONE))
1376                                 break;
1377
1378                         if (sk->sk_err) {
1379                                 copied = sock_error(sk);
1380                                 break;
1381                         }
1382
1383                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1384                                 break;
1385
1386                         if (sk->sk_state == TCP_CLOSE) {
1387                                 if (!sock_flag(sk, SOCK_DONE)) {
1388                                         /* This occurs when the user tries to read
1389                                          * from a never-connected socket.
1390                                          */
1391                                         copied = -ENOTCONN;
1392                                         break;
1393                                 }
1394                                 break;
1395                         }
1396
1397                         if (!timeo) {
1398                                 copied = -EAGAIN;
1399                                 break;
1400                         }
1401
1402                         if (signal_pending(current)) {
1403                                 copied = sock_intr_errno(timeo);
1404                                 break;
1405                         }
1406                 }
1407
1408                 tcp_cleanup_rbuf(sk, copied);
1409
1410                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1411                         /* Install new reader */
1412                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1413                                 user_recv = current;
1414                                 tp->ucopy.task = user_recv;
1415                                 tp->ucopy.iov = msg->msg_iov;
1416                         }
1417
1418                         tp->ucopy.len = len;
1419
1420                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1421                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1422
1423                         /* Ugly... If the prequeue is not empty, we have to
1424                          * process it before releasing the socket, otherwise
1425                          * the order will be broken at the second iteration.
1426                          * A more elegant solution is required!!!
1427                          *
1428                          * Look: we have the following (pseudo)queues:
1429                          *
1430                          * 1. packets in flight
1431                          * 2. backlog
1432                          * 3. prequeue
1433                          * 4. receive_queue
1434                          *
1435                          * Each queue can be processed only if the next ones
1436                          * are empty. At this point we have an empty receive_queue.
1437                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1438                          * when we jumped to the start of the loop because backlog
1439                          * processing added something to the receive_queue.
1440                          * We cannot release_sock(), because the backlog contains
1441                          * packets that arrived _after_ the prequeued ones.
1442                          *
1443                          * In short, the algorithm is clear --- process all
1444                          * the queues in order. We could do it more directly,
1445                          * requeueing packets from the backlog to the prequeue if it
1446                          * is not empty. That is more elegant, but eats cycles,
1447                          * unfortunately.
1448                          */
1449                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1450                                 goto do_prequeue;
1451
1452                         /* __ Set realtime policy in scheduler __ */
1453                 }
1454
1455                 if (copied >= target) {
1456                         /* Do not sleep, just process backlog. */
1457                         release_sock(sk);
1458                         lock_sock(sk);
1459                 } else
1460                         sk_wait_data(sk, &timeo);
1461
1462 #ifdef CONFIG_NET_DMA
1463                 tp->ucopy.wakeup = 0;
1464 #endif
1465
1466                 if (user_recv) {
1467                         int chunk;
1468
1469                         /* __ Restore normal policy in scheduler __ */
1470
1471                         if ((chunk = len - tp->ucopy.len) != 0) {
1472                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1473                                 len -= chunk;
1474                                 copied += chunk;
1475                         }
1476
1477                         if (tp->rcv_nxt == tp->copied_seq &&
1478                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1479 do_prequeue:
1480                                 tcp_prequeue_process(sk);
1481
1482                                 if ((chunk = len - tp->ucopy.len) != 0) {
1483                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1484                                         len -= chunk;
1485                                         copied += chunk;
1486                                 }
1487                         }
1488                 }
1489                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1490                         if (net_ratelimit())
1491                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1492                                        current->comm, task_pid_nr(current));
1493                         peek_seq = tp->copied_seq;
1494                 }
1495                 continue;
1496
1497         found_ok_skb:
1498                 /* Ok so how much can we use? */
1499                 used = skb->len - offset;
1500                 if (len < used)
1501                         used = len;
1502
1503                 /* Do we have urgent data here? */
1504                 if (tp->urg_data) {
1505                         u32 urg_offset = tp->urg_seq - *seq;
1506                         if (urg_offset < used) {
1507                                 if (!urg_offset) {
1508                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1509                                                 ++*seq;
1510                                                 offset++;
1511                                                 used--;
1512                                                 if (!used)
1513                                                         goto skip_copy;
1514                                         }
1515                                 } else
1516                                         used = urg_offset;
1517                         }
1518                 }
1519
1520                 if (!(flags & MSG_TRUNC)) {
1521 #ifdef CONFIG_NET_DMA
1522                         if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1523                                 tp->ucopy.dma_chan = get_softnet_dma();
1524
1525                         if (tp->ucopy.dma_chan) {
1526                                 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1527                                         tp->ucopy.dma_chan, skb, offset,
1528                                         msg->msg_iov, used,
1529                                         tp->ucopy.pinned_list);
1530
1531                                 if (tp->ucopy.dma_cookie < 0) {
1532
1533                                         printk(KERN_ALERT "dma_cookie < 0\n");
1534
1535                                         /* Exception. Bailout! */
1536                                         if (!copied)
1537                                                 copied = -EFAULT;
1538                                         break;
1539                                 }
1540                                 if ((offset + used) == skb->len)
1541                                         copied_early = 1;
1542
1543                         } else
1544 #endif
1545                         {
1546                                 err = skb_copy_datagram_iovec(skb, offset,
1547                                                 msg->msg_iov, used);
1548                                 if (err) {
1549                                         /* Exception. Bailout! */
1550                                         if (!copied)
1551                                                 copied = -EFAULT;
1552                                         break;
1553                                 }
1554                         }
1555                 }
1556
1557                 *seq += used;
1558                 copied += used;
1559                 len -= used;
1560
1561                 tcp_rcv_space_adjust(sk);
1562
1563 skip_copy:
1564                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1565                         tp->urg_data = 0;
1566                         tcp_fast_path_check(sk);
1567                 }
1568                 if (used + offset < skb->len)
1569                         continue;
1570
1571                 if (tcp_hdr(skb)->fin)
1572                         goto found_fin_ok;
1573                 if (!(flags & MSG_PEEK)) {
1574                         sk_eat_skb(sk, skb, copied_early);
1575                         copied_early = 0;
1576                 }
1577                 continue;
1578
1579         found_fin_ok:
1580                 /* Process the FIN. */
1581                 ++*seq;
1582                 if (!(flags & MSG_PEEK)) {
1583                         sk_eat_skb(sk, skb, copied_early);
1584                         copied_early = 0;
1585                 }
1586                 break;
1587         } while (len > 0);
1588
1589         if (user_recv) {
1590                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1591                         int chunk;
1592
1593                         tp->ucopy.len = copied > 0 ? len : 0;
1594
1595                         tcp_prequeue_process(sk);
1596
1597                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1598                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1599                                 len -= chunk;
1600                                 copied += chunk;
1601                         }
1602                 }
1603
1604                 tp->ucopy.task = NULL;
1605                 tp->ucopy.len = 0;
1606         }
1607
1608 #ifdef CONFIG_NET_DMA
1609         if (tp->ucopy.dma_chan) {
1610                 dma_cookie_t done, used;
1611
1612                 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1613
1614                 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1615                                                  tp->ucopy.dma_cookie, &done,
1616                                                  &used) == DMA_IN_PROGRESS) {
1617                         /* do partial cleanup of sk_async_wait_queue */
1618                         while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1619                                (dma_async_is_complete(skb->dma_cookie, done,
1620                                                       used) == DMA_SUCCESS)) {
1621                                 __skb_dequeue(&sk->sk_async_wait_queue);
1622                                 kfree_skb(skb);
1623                         }
1624                 }
1625
1626                 /* Safe to free early-copied skbs now */
1627                 __skb_queue_purge(&sk->sk_async_wait_queue);
1628                 dma_chan_put(tp->ucopy.dma_chan);
1629                 tp->ucopy.dma_chan = NULL;
1630         }
1631         if (tp->ucopy.pinned_list) {
1632                 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1633                 tp->ucopy.pinned_list = NULL;
1634         }
1635 #endif
1636
1637         /* According to UNIX98, msg_name/msg_namelen are ignored
1638          * on a connected socket. I was just happy when I found this 8) --ANK
1639          */
1640
1641         /* Clean up data we have read: This will do ACK frames. */
1642         tcp_cleanup_rbuf(sk, copied);
1643
1644         TCP_CHECK_TIMER(sk);
1645         release_sock(sk);
1646         return copied;
1647
1648 out:
1649         TCP_CHECK_TIMER(sk);
1650         release_sock(sk);
1651         return err;
1652
1653 recv_urg:
1654         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1655         goto out;
1656 }
1657
1658 /*
1659  *      State processing on a close. This implements the state shift for
1660  *      sending our FIN frame. Note that we only send a FIN for some
1661  *      states. A shutdown() may have already sent the FIN, or we may be
1662  *      closed.
1663  */
1664
1665 static const unsigned char new_state[16] = {
1666   /* current state:        new state:      action:      */
1667   /* (Invalid)          */ TCP_CLOSE,
1668   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1669   /* TCP_SYN_SENT       */ TCP_CLOSE,
1670   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1671   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1672   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1673   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1674   /* TCP_CLOSE          */ TCP_CLOSE,
1675   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1676   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1677   /* TCP_LISTEN         */ TCP_CLOSE,
1678   /* TCP_CLOSING        */ TCP_CLOSING,
1679 };
1680
1681 static int tcp_close_state(struct sock *sk)
1682 {
1683         int next = (int)new_state[sk->sk_state];
1684         int ns = next & TCP_STATE_MASK;
1685
1686         tcp_set_state(sk, ns);
1687
1688         return next & TCP_ACTION_FIN;
1689 }
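
/*
 * For example, new_state[TCP_ESTABLISHED] is TCP_FIN_WAIT1 | TCP_ACTION_FIN,
 * so tcp_close_state() on an established socket moves it to FIN-WAIT-1 and
 * returns non-zero, telling the caller (tcp_shutdown()/tcp_close() below)
 * to send a FIN, while e.g. TCP_SYN_SENT maps straight to TCP_CLOSE with
 * no FIN at all.
 */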
1690
1691 /*
1692  *      Shut down the sending side of a connection. Much like close except
1693  *      that we don't receive a shutdown or set sock_flag(sk, SOCK_DEAD).
1694  */
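/*
 *	A sketch of the usual userspace trigger (fd hypothetical):
 *
 *		shutdown(fd, SHUT_WR);
 *
 *	inet_shutdown() turns SHUT_WR into SEND_SHUTDOWN and calls down into
 *	tcp_shutdown(), so the side that is done sending emits its FIN while
 *	still being able to read whatever the peer has left to send.
 */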
1695
1696 void tcp_shutdown(struct sock *sk, int how)
1697 {
1698         /*      We need to grab some memory, and put together a FIN,
1699          *      and then put it into the queue to be sent.
1700          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1701          */
1702         if (!(how & SEND_SHUTDOWN))
1703                 return;
1704
1705         /* If we've already sent a FIN, or it's a closed state, skip this. */
1706         if ((1 << sk->sk_state) &
1707             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1708              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1709                 /* Clear out any half completed packets.  FIN if needed. */
1710                 if (tcp_close_state(sk))
1711                         tcp_send_fin(sk);
1712         }
1713 }
1714
1715 void tcp_close(struct sock *sk, long timeout)
1716 {
1717         struct sk_buff *skb;
1718         int data_was_unread = 0;
1719         int state;
1720
1721         lock_sock(sk);
1722         sk->sk_shutdown = SHUTDOWN_MASK;
1723
1724         if (sk->sk_state == TCP_LISTEN) {
1725                 tcp_set_state(sk, TCP_CLOSE);
1726
1727                 /* Special case. */
1728                 inet_csk_listen_stop(sk);
1729
1730                 goto adjudge_to_death;
1731         }
1732
1733         /*  We need to flush the recv. buffs.  We do this only on the
1734          *  descriptor close, not protocol-sourced closes, because the
1735          *  reader process may not have drained the data yet!
1736          */
1737         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1738                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1739                           tcp_hdr(skb)->fin;
1740                 data_was_unread += len;
1741                 __kfree_skb(skb);
1742         }
1743
1744         sk_stream_mem_reclaim(sk);
1745
1746         /* As outlined in RFC 2525, section 2.17, we send a RST here because
1747          * data was lost. To witness the awful effects of the old behavior of
1748          * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
1749          * GET in an FTP client, suspend the process, wait for the client to
1750          * advertise a zero window, then kill -9 the FTP client, wheee...
1751          * Note: timeout is always zero in such a case.
1752          */
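        /* Illustrative userspace consequence (fd hypothetical): if the peer
         * sent data that this process never read, a plain
         *
         *      close(fd);
         *
         * takes the branch below, and the peer sees the connection reset:
         * its next read()/write() typically fails with ECONNRESET/EPIPE
         * rather than seeing an orderly FIN.
         */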
1753         if (data_was_unread) {
1754                 /* Unread data was tossed, zap the connection. */
1755                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1756                 tcp_set_state(sk, TCP_CLOSE);
1757                 tcp_send_active_reset(sk, GFP_KERNEL);
1758         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1759                 /* Check zero linger _after_ checking for unread data. */
1760                 sk->sk_prot->disconnect(sk, 0);
1761                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1762         } else if (tcp_close_state(sk)) {
1763                 /* We FIN if the application ate all the data before
1764                  * zapping the connection.
1765                  */
1766
1767                 /* RED-PEN. Formally speaking, we have broken TCP state
1768                  * machine. State transitions:
1769                  *
1770                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1771                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1772                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1773                  *
1774                  * are legal only when FIN has been sent (i.e. in window),
1775                  * rather than queued out of window. Purists would complain.
1776                  *
1777                  * E.g. the "RFC state" is ESTABLISHED
1778                  * if the Linux state is FIN-WAIT-1 but the FIN has still not been sent.
1779                  *
1780                  * The visible deviations are that we sometimes
1781                  * enter the time-wait state when it is not really required
1782                  * (harmless), and do not send active resets when they are
1783                  * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1784                  * they look like CLOSING or LAST_ACK to Linux).
1785                  * Probably I missed some more loopholes.
1786                  *                                              --ANK
1787                  */
1788                 tcp_send_fin(sk);
1789         }
1790
1791         sk_stream_wait_close(sk, timeout);
1792
1793 adjudge_to_death:
1794         state = sk->sk_state;
1795         sock_hold(sk);
1796         sock_orphan(sk);
1797         atomic_inc(sk->sk_prot->orphan_count);
1798
1799         /* It is the last release_sock in its life. It will remove backlog. */
1800         release_sock(sk);
1801
1802
1803         /* Now socket is owned by kernel and we acquire BH lock
1804            to finish close. No need to check for user refs.
1805          */
1806         local_bh_disable();
1807         bh_lock_sock(sk);
1808         BUG_TRAP(!sock_owned_by_user(sk));
1809
1810         /* Have we already been destroyed by a softirq or backlog? */
1811         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1812                 goto out;
1813
1814         /*      This is a (useful) BSD violation of the RFC. There is a
1815          *      problem with TCP as specified in that the other end could
1816          *      keep a socket open forever with no application left at this end.
1817          *      We use a 3 minute timeout (about the same as BSD) then kill
1818          *      our end. If they send after that then tough - BUT: long enough
1819          *      that we won't make the old 4*rto = almost no time - whoops
1820          *      reset mistake.
1821          *
1822          *      Nope, it was not a mistake. It is really the desired behaviour,
1823          *      e.g. on HTTP servers, where such sockets are useless but
1824          *      consume significant resources. Let's do it with the special
1825          *      linger2 option.                                 --ANK
1826          */
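        /* The linger2 knob mentioned above is per-socket; a sketch of
         * setting it from userspace (value in seconds, fd hypothetical):
         *
         *      int fin_timeout = 10;
         *
         *      setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_timeout,
         *                 sizeof(fin_timeout));
         *
         * which ends up in tp->linger2 via the TCP_LINGER2 case of
         * do_tcp_setsockopt() further down in this file.
         */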
1827
1828         if (sk->sk_state == TCP_FIN_WAIT2) {
1829                 struct tcp_sock *tp = tcp_sk(sk);
1830                 if (tp->linger2 < 0) {
1831                         tcp_set_state(sk, TCP_CLOSE);
1832                         tcp_send_active_reset(sk, GFP_ATOMIC);
1833                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1834                 } else {
1835                         const int tmo = tcp_fin_time(sk);
1836
1837                         if (tmo > TCP_TIMEWAIT_LEN) {
1838                                 inet_csk_reset_keepalive_timer(sk,
1839                                                 tmo - TCP_TIMEWAIT_LEN);
1840                         } else {
1841                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1842                                 goto out;
1843                         }
1844                 }
1845         }
1846         if (sk->sk_state != TCP_CLOSE) {
1847                 sk_stream_mem_reclaim(sk);
1848                 if (tcp_too_many_orphans(sk,
1849                                 atomic_read(sk->sk_prot->orphan_count))) {
1850                         if (net_ratelimit())
1851                                 printk(KERN_INFO "TCP: too many orphaned "
1852                                        "sockets\n");
1853                         tcp_set_state(sk, TCP_CLOSE);
1854                         tcp_send_active_reset(sk, GFP_ATOMIC);
1855                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1856                 }
1857         }
1858
1859         if (sk->sk_state == TCP_CLOSE)
1860                 inet_csk_destroy_sock(sk);
1861         /* Otherwise, socket is reprieved until protocol close. */
1862
1863 out:
1864         bh_unlock_sock(sk);
1865         local_bh_enable();
1866         sock_put(sk);
1867 }
1868
1869 /* These states need RST on ABORT according to RFC793 */
1870
1871 static inline int tcp_need_reset(int state)
1872 {
1873         return (1 << state) &
1874                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1875                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1876 }
1877
1878 int tcp_disconnect(struct sock *sk, int flags)
1879 {
1880         struct inet_sock *inet = inet_sk(sk);
1881         struct inet_connection_sock *icsk = inet_csk(sk);
1882         struct tcp_sock *tp = tcp_sk(sk);
1883         int err = 0;
1884         int old_state = sk->sk_state;
1885
1886         if (old_state != TCP_CLOSE)
1887                 tcp_set_state(sk, TCP_CLOSE);
1888
1889         /* ABORT function of RFC793 */
1890         if (old_state == TCP_LISTEN) {
1891                 inet_csk_listen_stop(sk);
1892         } else if (tcp_need_reset(old_state) ||
1893                    (tp->snd_nxt != tp->write_seq &&
1894                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1895                 /* The last check adjusts for the discrepancy between Linux
1896                  * and the RFC states.
1897                  */
1898                 tcp_send_active_reset(sk, gfp_any());
1899                 sk->sk_err = ECONNRESET;
1900         } else if (old_state == TCP_SYN_SENT)
1901                 sk->sk_err = ECONNRESET;
1902
1903         tcp_clear_xmit_timers(sk);
1904         __skb_queue_purge(&sk->sk_receive_queue);
1905         tcp_write_queue_purge(sk);
1906         __skb_queue_purge(&tp->out_of_order_queue);
1907 #ifdef CONFIG_NET_DMA
1908         __skb_queue_purge(&sk->sk_async_wait_queue);
1909 #endif
1910
1911         inet->dport = 0;
1912
1913         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1914                 inet_reset_saddr(sk);
1915
1916         sk->sk_shutdown = 0;
1917         sock_reset_flag(sk, SOCK_DONE);
1918         tp->srtt = 0;
1919         if ((tp->write_seq += tp->max_window + 2) == 0)
1920                 tp->write_seq = 1;
1921         icsk->icsk_backoff = 0;
1922         tp->snd_cwnd = 2;
1923         icsk->icsk_probes_out = 0;
1924         tp->packets_out = 0;
1925         tp->snd_ssthresh = 0x7fffffff;
1926         tp->snd_cwnd_cnt = 0;
1927         tp->bytes_acked = 0;
1928         tcp_set_ca_state(sk, TCP_CA_Open);
1929         tcp_clear_retrans(tp);
1930         inet_csk_delack_init(sk);
1931         tcp_init_send_head(sk);
1932         memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
1933         __sk_dst_reset(sk);
1934
1935         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1936
1937         sk->sk_error_report(sk);
1938         return err;
1939 }
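
/*
 * Userspace normally reaches tcp_disconnect() by "connecting" the socket
 * to an address of family AF_UNSPEC, roughly (fd hypothetical):
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 *
 * after which the socket is back in TCP_CLOSE and can be connected again.
 */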
1940
1941 /*
1942  *      Socket option code for TCP.
1943  */
1944 static int do_tcp_setsockopt(struct sock *sk, int level,
1945                 int optname, char __user *optval, int optlen)
1946 {
1947         struct tcp_sock *tp = tcp_sk(sk);
1948         struct inet_connection_sock *icsk = inet_csk(sk);
1949         int val;
1950         int err = 0;
1951
1952         /* This is a string value; all the others are ints */
1953         if (optname == TCP_CONGESTION) {
1954                 char name[TCP_CA_NAME_MAX];
1955
1956                 if (optlen < 1)
1957                         return -EINVAL;
1958
1959                 val = strncpy_from_user(name, optval,
1960                                         min(TCP_CA_NAME_MAX-1, optlen));
1961                 if (val < 0)
1962                         return -EFAULT;
1963                 name[val] = 0;
1964
1965                 lock_sock(sk);
1966                 err = tcp_set_congestion_control(sk, name);
1967                 release_sock(sk);
1968                 return err;
1969         }
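
        /* E.g. (fd hypothetical; "reno" is always available since it is
         * registered in tcp_init() at the bottom of this file):
         *
         *      setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno",
         *                 strlen("reno"));
         */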
1970
1971         if (optlen < sizeof(int))
1972                 return -EINVAL;
1973
1974         if (get_user(val, (int __user *)optval))
1975                 return -EFAULT;
1976
1977         lock_sock(sk);
1978
1979         switch (optname) {
1980         case TCP_MAXSEG:
1981                 /* Values greater than the interface MTU won't take effect. However,
1982                  * at the point when this call is made we typically don't yet
1983                  * know which interface is going to be used. */
1984                 if (val < 8 || val > MAX_TCP_WINDOW) {
1985                         err = -EINVAL;
1986                         break;
1987                 }
1988                 tp->rx_opt.user_mss = val;
1989                 break;
1990
1991         case TCP_NODELAY:
1992                 if (val) {
1993                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1994                          * this option on corked socket is remembered, but
1995                          * it is not activated until cork is cleared.
1996                          *
1997                          * However, when TCP_NODELAY is set we make
1998                          * an explicit push, which overrides even TCP_CORK
1999                          * for currently queued segments.
2000                          */
2001                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2002                         tcp_push_pending_frames(sk);
2003                 } else {
2004                         tp->nonagle &= ~TCP_NAGLE_OFF;
2005                 }
2006                 break;
2007
2008         case TCP_CORK:
2009                 /* When set, this indicates that non-full frames should always be queued.
2010                  * Later the user clears this option and we transmit
2011                  * any pending partial frames in the queue.  This is
2012                  * meant to be used alongside sendfile() to get properly
2013                  * filled frames when the user (for example) must write
2014                  * out headers with a write() call first and then use
2015                  * sendfile to send out the data parts.
2016                  *
2017                  * TCP_CORK can be set together with TCP_NODELAY and it is
2018                  * stronger than TCP_NODELAY.
2019                  */
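                /* A typical userspace sequence for the sendfile() case
                 * described above (fd, filefd and lengths hypothetical):
                 *
                 *      int on = 1, off = 0;
                 *
                 *      setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
                 *      write(fd, headers, hdr_len);
                 *      sendfile(fd, filefd, NULL, file_len);
                 *      setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
                 *
                 * Clearing TCP_CORK at the end is what pushes out the last
                 * partial frame, via tcp_push_pending_frames() below.
                 */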
2020                 if (val) {
2021                         tp->nonagle |= TCP_NAGLE_CORK;
2022                 } else {
2023                         tp->nonagle &= ~TCP_NAGLE_CORK;
2024                         if (tp->nonagle&TCP_NAGLE_OFF)
2025                                 tp->nonagle |= TCP_NAGLE_PUSH;
2026                         tcp_push_pending_frames(sk);
2027                 }
2028                 break;
2029
2030         case TCP_KEEPIDLE:
2031                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2032                         err = -EINVAL;
2033                 else {
2034                         tp->keepalive_time = val * HZ;
2035                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2036                             !((1 << sk->sk_state) &
2037                               (TCPF_CLOSE | TCPF_LISTEN))) {
2038                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2039                                 if (tp->keepalive_time > elapsed)
2040                                         elapsed = tp->keepalive_time - elapsed;
2041                                 else
2042                                         elapsed = 0;
2043                                 inet_csk_reset_keepalive_timer(sk, elapsed);
2044                         }
2045                 }
2046                 break;
2047         case TCP_KEEPINTVL:
2048                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2049                         err = -EINVAL;
2050                 else
2051                         tp->keepalive_intvl = val * HZ;
2052                 break;
2053         case TCP_KEEPCNT:
2054                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2055                         err = -EINVAL;
2056                 else
2057                         tp->keepalive_probes = val;
2058                 break;
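
        /* The three keepalive knobs above only matter once keepalives have
         * been enabled on the socket; a rough userspace sketch (fd and
         * values hypothetical):
         *
         *      int on = 1, idle = 60, intvl = 10, cnt = 5;
         *
         *      setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
         *      setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
         *      setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
         *      setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
         */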
2059         case TCP_SYNCNT:
2060                 if (val < 1 || val > MAX_TCP_SYNCNT)
2061                         err = -EINVAL;
2062                 else
2063                         icsk->icsk_syn_retries = val;
2064                 break;
2065
2066         case TCP_LINGER2:
2067                 if (val < 0)
2068                         tp->linger2 = -1;
2069                 else if (val > sysctl_tcp_fin_timeout / HZ)
2070                         tp->linger2 = 0;
2071                 else
2072                         tp->linger2 = val * HZ;
2073                 break;
2074
2075         case TCP_DEFER_ACCEPT:
2076                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2077                 if (val > 0) {
2078                         /* Translate value in seconds to number of
2079                          * retransmits */
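                        /* Worked example, assuming TCP_TIMEOUT_INIT is 3*HZ
                         * (3 seconds): for val == 30 the loop stops at 4
                         * (since 30 <= 3 << 4 == 48), the final increment
                         * makes rskq_defer_accept 5, and getsockopt() below
                         * reports it back as 3 << (5 - 1) == 48 seconds.
                         */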
2080                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2081                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2082                                        icsk->icsk_accept_queue.rskq_defer_accept))
2083                                 icsk->icsk_accept_queue.rskq_defer_accept++;
2084                         icsk->icsk_accept_queue.rskq_defer_accept++;
2085                 }
2086                 break;
2087
2088         case TCP_WINDOW_CLAMP:
2089                 if (!val) {
2090                         if (sk->sk_state != TCP_CLOSE) {
2091                                 err = -EINVAL;
2092                                 break;
2093                         }
2094                         tp->window_clamp = 0;
2095                 } else
2096                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2097                                                 SOCK_MIN_RCVBUF / 2 : val;
2098                 break;
2099
2100         case TCP_QUICKACK:
2101                 if (!val) {
2102                         icsk->icsk_ack.pingpong = 1;
2103                 } else {
2104                         icsk->icsk_ack.pingpong = 0;
2105                         if ((1 << sk->sk_state) &
2106                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2107                             inet_csk_ack_scheduled(sk)) {
2108                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2109                                 tcp_cleanup_rbuf(sk, 1);
2110                                 if (!(val & 1))
2111                                         icsk->icsk_ack.pingpong = 1;
2112                         }
2113                 }
2114                 break;
2115
2116 #ifdef CONFIG_TCP_MD5SIG
2117         case TCP_MD5SIG:
2118                 /* Read the IP->Key mappings from userspace */
2119                 err = tp->af_specific->md5_parse(sk, optval, optlen);
2120                 break;
2121 #endif
2122
2123         default:
2124                 err = -ENOPROTOOPT;
2125                 break;
2126         }
2127
2128         release_sock(sk);
2129         return err;
2130 }
2131
2132 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2133                    int optlen)
2134 {
2135         struct inet_connection_sock *icsk = inet_csk(sk);
2136
2137         if (level != SOL_TCP)
2138                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2139                                                      optval, optlen);
2140         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2141 }
2142
2143 #ifdef CONFIG_COMPAT
2144 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2145                           char __user *optval, int optlen)
2146 {
2147         if (level != SOL_TCP)
2148                 return inet_csk_compat_setsockopt(sk, level, optname,
2149                                                   optval, optlen);
2150         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2151 }
2152
2153 EXPORT_SYMBOL(compat_tcp_setsockopt);
2154 #endif
2155
2156 /* Return information about state of tcp endpoint in API format. */
2157 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2158 {
2159         struct tcp_sock *tp = tcp_sk(sk);
2160         const struct inet_connection_sock *icsk = inet_csk(sk);
2161         u32 now = tcp_time_stamp;
2162
2163         memset(info, 0, sizeof(*info));
2164
2165         info->tcpi_state = sk->sk_state;
2166         info->tcpi_ca_state = icsk->icsk_ca_state;
2167         info->tcpi_retransmits = icsk->icsk_retransmits;
2168         info->tcpi_probes = icsk->icsk_probes_out;
2169         info->tcpi_backoff = icsk->icsk_backoff;
2170
2171         if (tp->rx_opt.tstamp_ok)
2172                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2173         if (tcp_is_sack(tp))
2174                 info->tcpi_options |= TCPI_OPT_SACK;
2175         if (tp->rx_opt.wscale_ok) {
2176                 info->tcpi_options |= TCPI_OPT_WSCALE;
2177                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2178                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2179         }
2180
2181         if (tp->ecn_flags&TCP_ECN_OK)
2182                 info->tcpi_options |= TCPI_OPT_ECN;
2183
2184         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2185         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2186         info->tcpi_snd_mss = tp->mss_cache;
2187         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2188
2189         if (sk->sk_state == TCP_LISTEN) {
2190                 info->tcpi_unacked = sk->sk_ack_backlog;
2191                 info->tcpi_sacked = sk->sk_max_ack_backlog;
2192         } else {
2193                 info->tcpi_unacked = tp->packets_out;
2194                 info->tcpi_sacked = tp->sacked_out;
2195         }
2196         info->tcpi_lost = tp->lost_out;
2197         info->tcpi_retrans = tp->retrans_out;
2198         info->tcpi_fackets = tp->fackets_out;
2199
2200         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2201         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2202         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2203
2204         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2205         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2206         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2207         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2208         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2209         info->tcpi_snd_cwnd = tp->snd_cwnd;
2210         info->tcpi_advmss = tp->advmss;
2211         info->tcpi_reordering = tp->reordering;
2212
2213         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2214         info->tcpi_rcv_space = tp->rcvq_space.space;
2215
2216         info->tcpi_total_retrans = tp->total_retrans;
2217 }
2218
2219 EXPORT_SYMBOL_GPL(tcp_get_info);
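
/*
 * Userspace reads this through getsockopt(); a minimal sketch
 * (fd hypothetical):
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us, cwnd %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 *
 * The TCP_INFO case of do_tcp_getsockopt() below copies out at most
 * sizeof(struct tcp_info) bytes and reports the actual length in len.
 */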
2220
2221 static int do_tcp_getsockopt(struct sock *sk, int level,
2222                 int optname, char __user *optval, int __user *optlen)
2223 {
2224         struct inet_connection_sock *icsk = inet_csk(sk);
2225         struct tcp_sock *tp = tcp_sk(sk);
2226         int val, len;
2227
2228         if (get_user(len, optlen))
2229                 return -EFAULT;
2230
2231         len = min_t(unsigned int, len, sizeof(int));
2232
2233         if (len < 0)
2234                 return -EINVAL;
2235
2236         switch (optname) {
2237         case TCP_MAXSEG:
2238                 val = tp->mss_cache;
2239                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2240                         val = tp->rx_opt.user_mss;
2241                 break;
2242         case TCP_NODELAY:
2243                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2244                 break;
2245         case TCP_CORK:
2246                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2247                 break;
2248         case TCP_KEEPIDLE:
2249                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2250                 break;
2251         case TCP_KEEPINTVL:
2252                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2253                 break;
2254         case TCP_KEEPCNT:
2255                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2256                 break;
2257         case TCP_SYNCNT:
2258                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2259                 break;
2260         case TCP_LINGER2:
2261                 val = tp->linger2;
2262                 if (val >= 0)
2263                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2264                 break;
2265         case TCP_DEFER_ACCEPT:
2266                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2267                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2268                 break;
2269         case TCP_WINDOW_CLAMP:
2270                 val = tp->window_clamp;
2271                 break;
2272         case TCP_INFO: {
2273                 struct tcp_info info;
2274
2275                 if (get_user(len, optlen))
2276                         return -EFAULT;
2277
2278                 tcp_get_info(sk, &info);
2279
2280                 len = min_t(unsigned int, len, sizeof(info));
2281                 if (put_user(len, optlen))
2282                         return -EFAULT;
2283                 if (copy_to_user(optval, &info, len))
2284                         return -EFAULT;
2285                 return 0;
2286         }
2287         case TCP_QUICKACK:
2288                 val = !icsk->icsk_ack.pingpong;
2289                 break;
2290
2291         case TCP_CONGESTION:
2292                 if (get_user(len, optlen))
2293                         return -EFAULT;
2294                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2295                 if (put_user(len, optlen))
2296                         return -EFAULT;
2297                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2298                         return -EFAULT;
2299                 return 0;
2300         default:
2301                 return -ENOPROTOOPT;
2302         }
2303
2304         if (put_user(len, optlen))
2305                 return -EFAULT;
2306         if (copy_to_user(optval, &val, len))
2307                 return -EFAULT;
2308         return 0;
2309 }
2310
2311 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2312                    int __user *optlen)
2313 {
2314         struct inet_connection_sock *icsk = inet_csk(sk);
2315
2316         if (level != SOL_TCP)
2317                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2318                                                      optval, optlen);
2319         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2320 }
2321
2322 #ifdef CONFIG_COMPAT
2323 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2324                           char __user *optval, int __user *optlen)
2325 {
2326         if (level != SOL_TCP)
2327                 return inet_csk_compat_getsockopt(sk, level, optname,
2328                                                   optval, optlen);
2329         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2330 }
2331
2332 EXPORT_SYMBOL(compat_tcp_getsockopt);
2333 #endif
2334
2335 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2336 {
2337         struct sk_buff *segs = ERR_PTR(-EINVAL);
2338         struct tcphdr *th;
2339         unsigned thlen;
2340         unsigned int seq;
2341         __be32 delta;
2342         unsigned int oldlen;
2343         unsigned int len;
2344
2345         if (!pskb_may_pull(skb, sizeof(*th)))
2346                 goto out;
2347
2348         th = tcp_hdr(skb);
2349         thlen = th->doff * 4;
2350         if (thlen < sizeof(*th))
2351                 goto out;
2352
2353         if (!pskb_may_pull(skb, thlen))
2354                 goto out;
2355
2356         oldlen = (u16)~skb->len;
2357         __skb_pull(skb, thlen);
2358
2359         if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2360                 /* Packet is from an untrusted source, reset gso_segs. */
2361                 int type = skb_shinfo(skb)->gso_type;
2362                 int mss;
2363
2364                 if (unlikely(type &
2365                              ~(SKB_GSO_TCPV4 |
2366                                SKB_GSO_DODGY |
2367                                SKB_GSO_TCP_ECN |
2368                                SKB_GSO_TCPV6 |
2369                                0) ||
2370                              !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2371                         goto out;
2372
2373                 mss = skb_shinfo(skb)->gso_size;
2374                 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2375
2376                 segs = NULL;
2377                 goto out;
2378         }
2379
2380         segs = skb_segment(skb, features);
2381         if (IS_ERR(segs))
2382                 goto out;
2383
2384         len = skb_shinfo(skb)->gso_size;
2385         delta = htonl(oldlen + (thlen + len));
2386
2387         skb = segs;
2388         th = tcp_hdr(skb);
2389         seq = ntohl(th->seq);
2390
2391         do {
2392                 th->fin = th->psh = 0;
2393
2394                 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2395                                        (__force u32)delta));
2396                 if (skb->ip_summed != CHECKSUM_PARTIAL)
2397                         th->check =
2398                              csum_fold(csum_partial(skb_transport_header(skb),
2399                                                     thlen, skb->csum));
2400
2401                 seq += len;
2402                 skb = skb->next;
2403                 th = tcp_hdr(skb);
2404
2405                 th->seq = htonl(seq);
2406                 th->cwr = 0;
2407         } while (skb->next);
2408
2409         delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2410                       skb->data_len);
2411         th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2412                                 (__force u32)delta));
2413         if (skb->ip_summed != CHECKSUM_PARTIAL)
2414                 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2415                                                    thlen, skb->csum));
2416
2417 out:
2418         return segs;
2419 }
2420 EXPORT_SYMBOL(tcp_tso_segment);
2421
2422 #ifdef CONFIG_TCP_MD5SIG
2423 static unsigned long tcp_md5sig_users;
2424 static struct tcp_md5sig_pool **tcp_md5sig_pool;
2425 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2426
2427 static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2428 {
2429         int cpu;
2430         for_each_possible_cpu(cpu) {
2431                 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2432                 if (p) {
2433                         if (p->md5_desc.tfm)
2434                                 crypto_free_hash(p->md5_desc.tfm);
2435                         kfree(p);
2436                         p = NULL;
2437                 }
2438         }
2439         free_percpu(pool);
2440 }
2441
2442 void tcp_free_md5sig_pool(void)
2443 {
2444         struct tcp_md5sig_pool **pool = NULL;
2445
2446         spin_lock_bh(&tcp_md5sig_pool_lock);
2447         if (--tcp_md5sig_users == 0) {
2448                 pool = tcp_md5sig_pool;
2449                 tcp_md5sig_pool = NULL;
2450         }
2451         spin_unlock_bh(&tcp_md5sig_pool_lock);
2452         if (pool)
2453                 __tcp_free_md5sig_pool(pool);
2454 }
2455
2456 EXPORT_SYMBOL(tcp_free_md5sig_pool);
2457
2458 static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2459 {
2460         int cpu;
2461         struct tcp_md5sig_pool **pool;
2462
2463         pool = alloc_percpu(struct tcp_md5sig_pool *);
2464         if (!pool)
2465                 return NULL;
2466
2467         for_each_possible_cpu(cpu) {
2468                 struct tcp_md5sig_pool *p;
2469                 struct crypto_hash *hash;
2470
2471                 p = kzalloc(sizeof(*p), GFP_KERNEL);
2472                 if (!p)
2473                         goto out_free;
2474                 *per_cpu_ptr(pool, cpu) = p;
2475
2476                 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2477                 if (!hash || IS_ERR(hash))
2478                         goto out_free;
2479
2480                 p->md5_desc.tfm = hash;
2481         }
2482         return pool;
2483 out_free:
2484         __tcp_free_md5sig_pool(pool);
2485         return NULL;
2486 }
2487
2488 struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2489 {
2490         struct tcp_md5sig_pool **pool;
2491         int alloc = 0;
2492
2493 retry:
2494         spin_lock_bh(&tcp_md5sig_pool_lock);
2495         pool = tcp_md5sig_pool;
2496         if (tcp_md5sig_users++ == 0) {
2497                 alloc = 1;
2498                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2499         } else if (!pool) {
2500                 tcp_md5sig_users--;
2501                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2502                 cpu_relax();
2503                 goto retry;
2504         } else
2505                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2506
2507         if (alloc) {
2508                 /* we cannot hold spinlock here because this may sleep. */
2509                 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2510                 spin_lock_bh(&tcp_md5sig_pool_lock);
2511                 if (!p) {
2512                         tcp_md5sig_users--;
2513                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2514                         return NULL;
2515                 }
2516                 pool = tcp_md5sig_pool;
2517                 if (pool) {
2518                         /* oops, it has already been assigned. */
2519                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2520                         __tcp_free_md5sig_pool(p);
2521                 } else {
2522                         tcp_md5sig_pool = pool = p;
2523                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2524                 }
2525         }
2526         return pool;
2527 }
2528
2529 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
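
/*
 * Rough usage pattern (the calling context is hypothetical; only the
 * functions in this block are assumed): a TCP-MD5 user calls
 * tcp_alloc_md5sig_pool() when its first key is installed, then, with
 * preemption disabled, grabs the current CPU's entry for each signature
 * it computes and drops the reference again afterwards:
 *
 *	if (!tcp_alloc_md5sig_pool())
 *		return -ENOMEM;
 *	...
 *	hp = __tcp_get_md5sig_pool(smp_processor_id());
 *	...
 *	__tcp_put_md5sig_pool();
 *
 * When the last key is removed, tcp_free_md5sig_pool() releases the
 * per-cpu crypto state.
 */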
2530
2531 struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2532 {
2533         struct tcp_md5sig_pool **p;
2534         spin_lock_bh(&tcp_md5sig_pool_lock);
2535         p = tcp_md5sig_pool;
2536         if (p)
2537                 tcp_md5sig_users++;
2538         spin_unlock_bh(&tcp_md5sig_pool_lock);
2539         return (p ? *per_cpu_ptr(p, cpu) : NULL);
2540 }
2541
2542 EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2543
2544 void __tcp_put_md5sig_pool(void)
2545 {
2546         tcp_free_md5sig_pool();
2547 }
2548
2549 EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2550 #endif
2551
2552 void tcp_done(struct sock *sk)
2553 {
2554         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2555                 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2556
2557         tcp_set_state(sk, TCP_CLOSE);
2558         tcp_clear_xmit_timers(sk);
2559
2560         sk->sk_shutdown = SHUTDOWN_MASK;
2561
2562         if (!sock_flag(sk, SOCK_DEAD))
2563                 sk->sk_state_change(sk);
2564         else
2565                 inet_csk_destroy_sock(sk);
2566 }
2567 EXPORT_SYMBOL_GPL(tcp_done);
2568
2569 extern void __skb_cb_too_small_for_tcp(int, int);
2570 extern struct tcp_congestion_ops tcp_reno;
2571
2572 static __initdata unsigned long thash_entries;
2573 static int __init set_thash_entries(char *str)
2574 {
2575         if (!str)
2576                 return 0;
2577         thash_entries = simple_strtoul(str, &str, 0);
2578         return 1;
2579 }
2580 __setup("thash_entries=", set_thash_entries);
2581
2582 void __init tcp_init(void)
2583 {
2584         struct sk_buff *skb = NULL;
2585         unsigned long limit;
2586         int order, i, max_share;
2587
2588         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2589                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2590                                            sizeof(skb->cb));
2591
2592         tcp_hashinfo.bind_bucket_cachep =
2593                 kmem_cache_create("tcp_bind_bucket",
2594                                   sizeof(struct inet_bind_bucket), 0,
2595                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2596
2597         /* Size and allocate the main established and bind bucket
2598          * hash tables.
2599          *
2600          * The methodology is similar to that of the buffer cache.
2601          */
2602         tcp_hashinfo.ehash =
2603                 alloc_large_system_hash("TCP established",
2604                                         sizeof(struct inet_ehash_bucket),
2605                                         thash_entries,
2606                                         (num_physpages >= 128 * 1024) ?
2607                                         13 : 15,
2608                                         0,
2609                                         &tcp_hashinfo.ehash_size,
2610                                         NULL,
2611                                         thash_entries ? 0 : 512 * 1024);
2612         tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2613         for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2614                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2615                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2616         }
2617         if (inet_ehash_locks_alloc(&tcp_hashinfo))
2618                 panic("TCP: failed to alloc ehash_locks");
2619         tcp_hashinfo.bhash =
2620                 alloc_large_system_hash("TCP bind",
2621                                         sizeof(struct inet_bind_hashbucket),
2622                                         tcp_hashinfo.ehash_size,
2623                                         (num_physpages >= 128 * 1024) ?
2624                                         13 : 15,
2625                                         0,
2626                                         &tcp_hashinfo.bhash_size,
2627                                         NULL,
2628                                         64 * 1024);
2629         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2630         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2631                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2632                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2633         }
2634
2635         /* Try to be a bit smarter and adjust defaults depending
2636          * on available memory.
2637          */
2638         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2639                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2640                         order++)
2641                 ;
2642         if (order >= 4) {
2643                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2644                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2645                 sysctl_max_syn_backlog = 1024;
2646         } else if (order < 3) {
2647                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2648                 sysctl_tcp_max_orphans >>= (3 - order);
2649                 sysctl_max_syn_backlog = 128;
2650         }
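             /*
              * "order" is the page-allocation order of the bind hash table
              * and serves as a rough proxy for total memory.  For example,
              * assuming 4 KB pages and 16-byte buckets, a 64K-bucket table
              * occupies 1 MB, giving order 8, so the "order >= 4" defaults
              * above apply; only very small systems, whose table fits in at
              * most four pages, have their limits scaled down.
              */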
2651
2652         /* Set the pressure threshold to be a fraction of global memory that
2653          * is up to 1/2 at 256 MB, decreasing toward zero as the amount of
2654          * memory shrinks, with a floor of 128 pages.
2655          */
2656         limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2657         limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2658         limit = max(limit, 128UL);
2659         sysctl_tcp_mem[0] = limit / 4 * 3;
2660         sysctl_tcp_mem[1] = limit;
2661         sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
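             /*
              * Worked example, assuming 4 KB pages (PAGE_SHIFT == 12): on a
              * 256 MB machine nr_all_pages is roughly 65536, so the first
              * line yields min(65536, 65536) >> 8 = 256 and the second
              * (256 * 256) >> 1 = 32768 pages = 128 MB, i.e. half of RAM.
              * On a 128 MB machine the same math gives (128 * 128) >> 1 =
              * 8192 pages = 32 MB, a quarter of RAM.  tcp_mem then ends up
              * as { 3/4, 1, 3/2 } of that limit.
              */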
2662
2663         /* Set per-socket limits to no more than 1/128 the pressure threshold */
2664         limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2665         max_share = min(4UL*1024*1024, limit);
2666
2667         sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2668         sysctl_tcp_wmem[1] = 16*1024;
2669         sysctl_tcp_wmem[2] = max(64*1024, max_share);
2670
2671         sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2672         sysctl_tcp_rmem[1] = 87380;
2673         sysctl_tcp_rmem[2] = max(87380, max_share);
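             /*
              * Worked example, again with 4 KB pages: if sysctl_tcp_mem[1]
              * is 32768 pages (128 MB), then limit = 32768 << 5 = 1 MB,
              * which is exactly 128 MB / 128, and max_share =
              * min(4 MB, 1 MB) = 1 MB, so both tcp_wmem[2] and tcp_rmem[2]
              * come out at 1 MB on such a machine.
              */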
2674
2675         printk(KERN_INFO "TCP: Hash tables configured "
2676                "(established %d bind %d)\n",
2677                tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2678
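             /*
              * Register the built-in Reno algorithm at boot so that a
              * congestion control implementation is always available; other
              * algorithms register themselves later, typically as modules.
              */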
2679         tcp_register_congestion_control(&tcp_reno);
2680 }
2681
2682 EXPORT_SYMBOL(tcp_close);
2683 EXPORT_SYMBOL(tcp_disconnect);
2684 EXPORT_SYMBOL(tcp_getsockopt);
2685 EXPORT_SYMBOL(tcp_ioctl);
2686 EXPORT_SYMBOL(tcp_poll);
2687 EXPORT_SYMBOL(tcp_read_sock);
2688 EXPORT_SYMBOL(tcp_recvmsg);
2689 EXPORT_SYMBOL(tcp_sendmsg);
2690 EXPORT_SYMBOL(tcp_splice_read);
2691 EXPORT_SYMBOL(tcp_sendpage);
2692 EXPORT_SYMBOL(tcp_setsockopt);
2693 EXPORT_SYMBOL(tcp_shutdown);
2694 EXPORT_SYMBOL(tcp_statistics);