/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.144 2001/11/06 22:21:08 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 */
#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
	tp->send_head = skb->next;
	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
		tp->send_head = NULL;
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	if (tp->packets_out++ == 0)
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MTUs.
 * 4. We do not implement (3); instead we advertise an MSS calculated
 *    from the first-hop device mtu, but allow it to be raised to
 *    ip_rt_min_advmss.  This may be overridden via information stored
 *    in the routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
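/* Worked example (added for illustration, not in the original file):
 * on plain Ethernet the first-hop device mtu is 1500, so the SYN
 * advertises 1500 - 20 (IPv4 header) - 20 (TCP header) = 1460, well
 * below the 65535-40 ideal of point 2.
 */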
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst->advmss < mss) {
		mss = dst->advmss;
		tp->advmss = mss;
	}

	return (__u16)mss;
}
/* RFC2861. Reset CWND after an idle period longer than RTO to the
 * "restart window".  This is the first part of the cwnd validation
 * mechanism.
 */
static void tcp_cwnd_restart(struct tcp_opt *tp)
{
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp);
	u32 cwnd = tp->snd_cwnd;

	if (tcp_is_vegas(tp))
		tcp_vegas_enable(tp);

	tp->snd_ssthresh = tcp_current_ssthresh(tp);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}
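/* Decay example (added for illustration, not in the original file):
 * with tp->rto = 200ms, one second of idle time, snd_cwnd = 40 and
 * tcp_init_cwnd() = 10, the loop halves cwnd once per rto of idle
 * time while it stays above restart_cwnd: 40 -> 20 -> 10, so
 * snd_cwnd restarts at 10.
 */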
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
{
	u32 now = tcp_time_stamp;

	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
		tcp_cwnd_restart(tp);

	tp->lsndtime = now;

	/* If this is a reply within ATO of the last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
		tp->ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_dec_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return the result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if(new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* RFC1323 scaling applied */
	new_win >>= tp->rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}
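/* Scaling example (added for illustration, not in the original file):
 * with rcv_wscale = 2, a selected window of 131072 bytes is sent on the
 * wire as 131072 >> 2 = 32768; the peer shifts it back left by the
 * factor negotiated at SYN time, so the u16 field can represent
 * windows far larger than 65535.
 */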
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
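		/* Sizing example (added for illustration, not in the original
		 * file; assumes the usual option sizes where
		 * TCPOLEN_SACK_BASE_ALIGNED is 4 and TCPOLEN_SACK_PERBLOCK is
		 * 8): a non-SYN segment with timestamps in use and
		 * tp->eff_sacks == 2 carries 20 + 12 + 4 + 2*8 = 52 bytes of
		 * TCP header.
		 */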
		/* If the connection is idle and we are restarting,
		 * then we don't want to do any Vegas calculations
		 * until we get fresh RTT samples.  So when we
		 * restart, we reset our Vegas state to a clean
		 * slate.  After we get acks for this flight of
		 * packets, _then_ we can make Vegas calculations
		 * again.
		 */
		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
			tcp_vegas_enable(tp);

		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);
		/* Build TCP header and checksum it. */
		th->source		= sk->sport;
		th->dest		= sk->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		/* doff (header length in 32-bit words) and the flag bits
		 * share the 13th and 14th bytes of the header.
		 */
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;
		th->urg_ptr		= 0;

		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr	= htons(tp->snd_up-tcb->seq);
			th->urg		= 1;
		}
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb);

		TCP_INC_STATS(TcpOutSegs);
		err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special.  It does not guarantee that
		 * this packet is lost.  It tells us that the device is
		 * about to start dropping packets, or already drops
		 * some packets of the same priority, and invites us to
		 * send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
/* This is the main buffer sending routine.  We queue the buffer
 * and decide whether to queue or transmit now.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->write_queue, skb);
	tcp_charge_skb(sk, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_minshall_update(tp, cur_mss, skb);
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
	/* Queue it, remembering where we must start sending. */
	if (tp->send_head == NULL)
		tp->send_head = skb;
}
/* Send _single_ skb sitting at the send head.  This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = tp->send_head;

	if (tcp_snd_test(tp, skb, cur_mss, 1)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
			tp->send_head = NULL;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
}
/* Split fragmented skb into two parts at length len. */

static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
{
	int i;
	int pos = skb->len - skb->data_len;

	if (len < pos) {
		/* Split line is inside header. */
		memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);

		/* And move data appendix as is. */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

		skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
		skb_shinfo(skb)->nr_frags = 0;

		skb1->data_len = skb->data_len;
		skb1->len += skb1->data_len;
		skb->data_len = 0;
		skb->len = len;
		skb->tail = skb->data+len;
	} else {
		int k = 0;
		int nfrags = skb_shinfo(skb)->nr_frags;

		/* Second chunk has no header, nothing to copy. */

		skb_shinfo(skb)->nr_frags = 0;
		skb1->len = skb1->data_len = skb->len - len;
		skb->len = len;
		skb->data_len = len - pos;

		for (i=0; i<nfrags; i++) {
			int size = skb_shinfo(skb)->frags[i].size;
			if (pos + size > len) {
				skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

				if (pos < len) {
					/* Split frag.
					 * We have two variants in this case:
					 * 1. Move all the frag to the second
					 *    part, if it is possible. F.e.
					 *    this approach is mandatory for TUX,
					 *    where splitting is expensive.
					 * 2. Split it accurately.  This is what
					 *    we do here.
					 */
					get_page(skb_shinfo(skb)->frags[i].page);
					skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
					skb_shinfo(skb1)->frags[0].size -= (len-pos);
					skb_shinfo(skb)->frags[i].size = len-pos;
					skb_shinfo(skb)->nr_frags++;
				}
				k++;
			} else
				skb_shinfo(skb)->nr_frags++;
			pos += size;
		}
		skb_shinfo(skb1)->nr_frags = k;
	}
}
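/* Worked example (added for illustration, not in the original file):
 * an skb with len = 3000 and data_len = 2800 has pos = 200 linear
 * bytes.  Splitting at len = 1000 takes the second branch: skb1 gets
 * len = data_len = 2000, skb keeps len = 1000 with data_len = 800,
 * and the single 2800-byte frag is shared via get_page(), trimmed to
 * 800 bytes in skb while skb1's copy starts 800 bytes further in.
 */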
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	tcp_charge_skb(sk, buff);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
		tp->lost_out++;
		tp->left_out++;
	}
	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_HW;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks odd, but our code really does use the "when" of
	 * skbs which it has never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT account
   for TCP options, but includes only the bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is the minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Bound mss with half of window */
	if (tp->max_window && mss_now > (tp->max_window>>1))
		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = mss_now;

	return mss_now;
}
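/* Worked example (added for illustration, not in the original file):
 * for an IPv4 path with pmtu = 1500 and timestamps negotiated
 * (tcp_header_len = 20 + 12): 1500 - 20 - 20 = 1460, assumed below
 * mss_clamp with no extension headers, then 1460 - (32 - 20) = 1448
 * is cached in tp->mss_cache.
 */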
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if(sk->state != TCP_CLOSE) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Account for SACKS, we may need to fragment due to this.
		 * It is just like the real MSS changing on us midstream.
		 * We also handle things correctly when the user adds some
		 * IP options mid-stream.  Silly to do, but cover it.
		 */
		mss_now = tcp_current_mss(sk);

		while((skb = tp->send_head) &&
		      tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
				break;
			/* Advance the send_head.  This one is sent out. */
			update_send_head(sk, tp, skb);
			tcp_minshall_update(tp, mss_now, skb);
			sent_pkts = 1;
		}

		if (sent_pkts) {
			tcp_cwnd_validate(sk, tp);
			return 0;
		}

		return !tp->packets_out && tp->send_head;
	}
	return 0;
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = tp->ack.rcv_mss;
	int free_space = tcp_space(sk);
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	if (mss > full_space)
		mss = full_space;

	if (free_space < full_space/2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

		if (free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Get the largest window that is a nice multiple of mss.
	 * Window clamp already applied above.
	 * If our current window offering is within 1 mss of the
	 * free space we just keep it.  This prevents the divide
	 * and multiply from happening most of the time.
	 * We also don't do any window rounding when the free space
	 * is too small.
	 */
	window = tp->rcv_wnd;
	if (window <= free_space - mss || window > free_space)
		window = (free_space/mss)*mss;
	else if (mss == full_space &&
	         free_space > window + full_space/2)
		window = free_space;

	return window;
}
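/* Rounding example (added for illustration, not in the original file):
 * with mss = 1460, free_space = 10000 and a previous rcv_wnd of 4380,
 * window <= free_space - mss holds, so the new offer becomes
 * (10000/1460)*1460 = 8760, a whole multiple of the mss as the comment
 * above describes.
 */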
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

		if (next_skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_HW;

		if (skb->ip_summed != CHECKSUM_HW)
			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tp->retrans_out--;
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tp->lost_out--;
			tp->left_out--;
		}
		/* Reno case is special. Sigh... */
		if (!tp->sack_ok && tp->sacked_out) {
			tp->sacked_out--;
			tp->left_out--;
		}

		/* Not quite right: it can be > snd.fack, but
		 * it is better to underestimate fackets.
		 */
		if (tp->fackets_out)
			tp->fackets_out--;
		tcp_free_skb(sk, next_skb);
		tp->packets_out--;
	}
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);
	int lost = 0;

	for_retrans_queue(skb, sk, tp) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out--;
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out++;
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tcp_sync_left_out(tp);

	/* Don't muck with the congestion window here.  The reason is
	 * that we do not increase the amount of _data_ in the network,
	 * but the units have changed, so the effective cwnd/ssthresh
	 * really are reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(tp, TCP_CA_Loss);
	}
	tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);
	int err;

	/* Do not send more than we queued.  1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
		return -EAGAIN;

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it.  The exception is the
	 * case, when window is shrunk to zero.  In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	if(skb->len > cur_mss) {
		if(tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	   (skb->len < (cur_mss >> 1)) &&
	   (skb->next != tp->send_head) &&
	   (skb->next != (struct sk_buff *)&sk->write_queue) &&
	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
	   (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if(tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if(skb->len > 0 &&
	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    pskb_copy(skb, GFP_ATOMIC):
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out++;

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}
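/* Guard example (added for illustration, not in the original file):
 * with sk->wmem_queued = 16000 bytes charged to the socket, the test
 * at the top of tcp_retransmit_skb() refuses to retransmit once
 * wmem_alloc exceeds min(16000 + 4000, sk->sndbuf); the extra quarter
 * is the reserve for clone/copy overhead mentioned in the comment.
 */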
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	int packet_cnt = tp->lost_out;

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		for_retrans_queue(skb, sk, tp) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(TCPFastRetrans);
					else
						NET_INC_STATS_BH(TCPSlowStartRetrans);

					if (skb == skb_peek(&sk->write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				if (--packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make a difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */

	if (tcp_may_send_now(sk, tp))
		return;

	packet_cnt = 0;

	for_retrans_queue(skb, sk, tp) {
		if(++packet_cnt > tp->fackets_out)
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if(tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(TCPForwardRetrans);
	}
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if(tp->send_head != NULL) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_send_skb(sk, skb, 1, mss_now);
	}
	__tcp_push_pending_frames(sk, tp, mss_now, 1);
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(TCPAbortFailed);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
		NET_INC_STATS(TCPAbortFailed);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff* skb;

	skb = skb_peek(&sk->write_queue);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			__skb_unlink(skb, &sk->write_queue);
			__skb_queue_head(&sk->write_queue, nskb);
			tcp_free_skb(sk, skb);
			tcp_charge_skb(sk, nskb);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
		TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	th->source = sk->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst->window;
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			req->wscale_ok,
			&rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TcpOutSegs);
	return skb;
}
/*
 * Do all connect socket setups that can be done AF independent.
 */
static inline void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tp->max_window = 0;
	tcp_sync_mss(sk, dst->pmtu);

	if (!tp->window_clamp)
		tp->window_clamp = dst->window;
	tp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &tp->rcv_wscale);

	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->err = 0;
	sk->done = 0;
	tp->snd_wnd = 0;
	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	tp->rto = TCP_TIMEOUT_INIT;
	tp->retransmits = 0;
	tcp_clear_retrans(tp);
}
/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *buff;

	tcp_connect_init(sk);

	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->write_queue, buff);
	tcp_charge_skb(sk, buff);
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int ato = tp->ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		int max_ato = HZ/2;

		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use tp->rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't an older one already. */
	if (tp->ack.pending&TCP_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, tp->ack.timeout))
			timeout = tp->ack.timeout;
	}
	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
	tp->ack.timeout = timeout;
	if (!mod_timer(&tp->delack_timer, timeout))
		sock_hold(sk);
}
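/* Bounding example (added for illustration, not in the original file;
 * assumes HZ = 100 so TCP_DELACK_MIN = HZ/25 = 4 jiffies): tp->srtt is
 * stored left-shifted by 3, so tp->srtt>>3 is the smoothed RTT in
 * jiffies.  A 40ms RTT gives rtt = 4, capping max_ato at 4, so an ato
 * of 10 jiffies is clamped to 4 before the timer is armed.
 */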
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if(sk->state != TCP_CLOSE) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
		if (buff == NULL) {
			tcp_schedule_ack(tp);
			tp->ack.ato = TCP_ATO_MIN;
			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_TCP_HEADER);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 *
 * Question: what should we do while in urgent mode?
 * 4.4BSD forces sending single byte of data.  We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = urgent;

	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
}
int tcp_write_wakeup(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *skb;

		if ((skb = tp->send_head) != NULL &&
		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
			int err;
			int mss = tcp_current_mss(sk);
			int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

			/* We are probing the opening of a window
			 * but the window size is != 0; this must have
			 * been the result of SWS avoidance (sender side).
			 */
			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
			    skb->len > mss) {
				seg_size = min(seg_size, mss);
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				if (tcp_fragment(sk, skb, seg_size))
					return -1;
			}
			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!err) {
				update_send_head(sk, tp, skb);
			}
			return err;
		} else {
			if (tp->urg_mode &&
			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
				tcp_xmit_probe_skb(sk, TCPCB_URG);
			return tcp_xmit_probe_skb(sk, 0);
		}
	}
	return -1;
}
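/* seg_size example (added for illustration, not in the original file):
 * if snd_una = 1000, snd_wnd = 500 and the head skb covers sequence
 * 1200..1800, then seg_size = 1000 + 500 - 1200 = 300.  The skb
 * overflows the window, so it is fragmented at min(300, mss) bytes and
 * the first piece is pushed out as a partial window probe.
 */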
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int err;

	err = tcp_write_wakeup(sk);

	if (tp->packets_out || !tp->send_head) {
		/* Cancel probe timer, if it is not required. */
		tp->probes_out = 0;
		tp->backoff = 0;
		return;
	}

	if (err <= 0) {
		if (tp->backoff < sysctl_tcp_retries2)
			tp->backoff++;
		tp->probes_out++;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember probes_out.
		 * Let local senders fight for local resources.
		 *
		 * But still use the accumulated backoff.
		 */
		if (!tp->probes_out)
			tp->probes_out = 1;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
	}
}
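/* Backoff example (added for illustration, not in the original file):
 * after three failed probes with tp->rto = 200ms, the next probe0
 * timer fires at min(200ms << 3, TCP_RTO_MAX) = 1600ms.  When the
 * probe failed only due to local congestion, the cap is the shorter
 * TCP_RESOURCE_PROBE_INTERVAL and tp->backoff is not increased.
 */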