2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_timer.c,v 1.87 2001/09/21 21:27:34 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
/* Retry / keepalive limits exported as sysctls; the TCP_* macros
 * supply the compile-time defaults. */
25 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
26 int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
27 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
28 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
29 int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
30 int sysctl_tcp_retries1 = TCP_RETR1;
31 int sysctl_tcp_retries2 = TCP_RETR2;
/* Zero by default; a runtime value is then chosen in
 * tcp_orphan_retries() below. */
32 int sysctl_tcp_orphan_retries;
/* Handlers for the per-socket timers wired up in tcp_init_xmit_timers(). */
34 static void tcp_write_timer(unsigned long);
35 static void tcp_delack_timer(unsigned long);
36 static void tcp_keepalive_timer (unsigned long data);
/* Printed when tcp_write_timer() dispatches on an unknown tp->pending value. */
38 const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
41 * Using different timers for retransmit, delayed acks and probes
42 * We may wish use just one timer maintaining a list of expire jiffies
/* Attach the three per-socket TCP timers: retransmission, delayed ACK,
 * and keepalive (the last one living in the generic sk->timer slot).
 * Every handler receives the socket pointer as its timer data. */
46 void tcp_init_xmit_timers(struct sock *sk)
48 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* Retransmission timer -> tcp_write_timer(). */
50 init_timer(&tp->retransmit_timer);
51 tp->retransmit_timer.function=&tcp_write_timer;
52 tp->retransmit_timer.data = (unsigned long) sk;
/* Delayed-ACK timer -> tcp_delack_timer(). */
55 init_timer(&tp->delack_timer);
56 tp->delack_timer.function=&tcp_delack_timer;
57 tp->delack_timer.data = (unsigned long) sk;
/* Keepalive (also used for listen-socket SYN-ACK processing and
 * FIN_WAIT2 timeout) -> tcp_keepalive_timer(). */
60 init_timer(&sk->timer);
61 sk->timer.function=&tcp_keepalive_timer;
62 sk->timer.data = (unsigned long) sk;
/* Cancel all pending TCP timers on the socket.  The body of each
 * successful-cancel branch is elided in this listing; presumably it
 * drops the socket reference held by the armed timer -- TODO confirm
 * against the full source. */
65 void tcp_clear_xmit_timers(struct sock *sk)
67 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
70 if (timer_pending(&tp->retransmit_timer) &&
71 del_timer(&tp->retransmit_timer))
76 if (timer_pending(&tp->delack_timer) &&
77 del_timer(&tp->delack_timer))
80 if(timer_pending(&sk->timer) && del_timer(&sk->timer))
/* Terminal timeout: report an error to the user and account the abort.
 * Follow-up teardown lines are elided in this listing. */
84 static void tcp_write_err(struct sock *sk)
/* A pending soft error (e.g. carried in by ICMP) takes precedence
 * over the generic ETIMEDOUT. */
86 sk->err = sk->err_soft ? : ETIMEDOUT;
90 NET_INC_STATS_BH(TCPAbortOnTimeout);
93 /* Do not allow orphaned sockets to eat all our resources.
94 * This is direct violation of TCP specs, but it is required
95 * to prevent DoS attacks. It is called when a retransmission timeout
96 * or zero probe timeout occurs on orphaned socket.
98 * Criterion is still not confirmed experimentally and may change.
99 * We kill the socket, if:
100 * 1. If number of orphaned sockets exceeds an administratively configured
102 * 2. If we have strong memory pressure.
104 static int tcp_out_of_resources(struct sock *sk, int do_reset)
106 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
107 int orphans = atomic_read(&tcp_orphan_count);
109 /* If peer does not open window for long time, or did not transmit
110 * anything for long time, penalize it. */
/* lsndtime is the last-send timestamp; compared as s32 to survive
 * jiffies wraparound. */
111 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
114 /* If some dubious ICMP arrived, penalize even more. */
/* Kill the orphan when the global orphan count or TCP memory budget
 * (sysctl_tcp_mem[2] = hard limit) is exceeded. */
118 if (orphans >= sysctl_tcp_max_orphans ||
119 (sk->wmem_queued > SOCK_MIN_SNDBUF &&
120 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
122 printk(KERN_INFO "Out of socket memory\n");
124 /* Catch exceptional cases, when connection requires reset.
125 * 1. Last segment was sent recently. */
126 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
127 /* 2. Window is closed. */
128 (!tp->snd_wnd && !tp->packets_out))
/* Tell the peer we are going away before reclaiming the socket. */
131 tcp_send_active_reset(sk, GFP_ATOMIC);
133 NET_INC_STATS_BH(TCPAbortOnMemory);
139 /* Calculate maximal number of retries on an orphaned socket. */
140 static int tcp_orphan_retries(struct sock *sk, int alive)
142 int retries = sysctl_tcp_orphan_retries; /* May be zero. */
144 /* We know from an ICMP that something is wrong. */
/* !alive means the backed-off RTO already reached TCP_RTO_MAX;
 * combined with a soft error, give up early (elided branch). */
145 if (sk->err_soft && !alive)
148 /* However, if socket sent something recently, select some safe
149 * number of retries. 8 corresponds to >100 seconds with minimal
151 if (retries == 0 && alive)
156 /* A write timeout has occurred. Process the after effects. */
/* Decide whether to keep retransmitting or give up on the connection.
 * Returns nonzero when the caller must abort (error path elided in
 * this listing); zero to continue retrying. */
157 static int tcp_write_timeout(struct sock *sk)
159 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Handshake states use the (per-socket or sysctl) SYN retry limit
 * and immediately give the route cache negative advice. */
162 if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
164 dst_negative_advice(&sk->dst_cache);
165 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
/* Established connection: past retries1, PMTU black-hole recovery
 * would belong here (see the rant below); we only advise the route. */
167 if (tp->retransmits >= sysctl_tcp_retries1) {
168 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
171 It is place to make it. It is not made. I do not want
172 to make it. It is disgusting. It does not work in any
173 case. Let me cite the same draft, which requires for
174 us to implement this:
176 "The one security concern raised by this memo is that ICMP black holes
177 are often caused by over-zealous security administrators who block
178 all ICMP messages. It is vitally important that those who design and
179 deploy security systems understand the impact of strict filtering on
180 upper-layer protocols. The safest web site in the world is worthless
181 if most TCP implementations cannot transfer data from it. It would
182 be far nicer to have all of the black holes fixed rather than fixing
183 all of the TCP implementations."
188 dst_negative_advice(&sk->dst_cache);
191 retry_until = sysctl_tcp_retries2;
/* Orphaned socket (elided condition): tighter limit, and possibly
 * reclaim it outright under resource pressure. */
193 int alive = (tp->rto < TCP_RTO_MAX);
195 retry_until = tcp_orphan_retries(sk, alive);
197 if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
202 if (tp->retransmits >= retry_until) {
203 /* Has it gone just too far? */
/* Delayed-ACK timer handler.  If the socket is busy in user context,
 * back off by TCP_DELACK_MIN and retry; otherwise drain the receive
 * prequeue and, if an ACK is still scheduled, account it and inflate
 * the ATO estimate since the delayed ACK evidently missed its window. */
210 static void tcp_delack_timer(unsigned long data)
212 struct sock *sk = (struct sock*)data;
213 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
216 if (sk->lock.users) {
217 /* Try again later. */
219 NET_INC_STATS_BH(DelayedACKLocked);
220 if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
/* Nothing to do if the socket is closed or no delayed-ACK timer
 * is actually armed. */
227 if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER))
/* Fired early relative to the recorded deadline: re-arm and exit. */
230 if ((long)(tp->ack.timeout - jiffies) > 0) {
231 if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
235 tp->ack.pending &= ~TCP_ACK_TIMER;
/* Push any segments parked on the prequeue through the normal
 * backlog receive path before deciding about the ACK. */
237 if (skb_queue_len(&tp->ucopy.prequeue)) {
240 net_statistics[smp_processor_id()*2].TCPSchedulerFailed += skb_queue_len(&tp->ucopy.prequeue);
242 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
243 sk->backlog_rcv(sk, skb);
245 tp->ucopy.memory = 0;
248 if (tcp_ack_scheduled(tp)) {
249 if (!tp->ack.pingpong) {
250 /* Delayed ACK missed: inflate ATO. */
251 tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
253 /* Delayed ACK missed: leave pingpong mode and
256 tp->ack.pingpong = 0;
257 tp->ack.ato = TCP_ATO_MIN;
260 NET_INC_STATS_BH(DelayedACKs);
/* Under memory pressure, reclaim (elided call) before returning. */
265 if (tcp_memory_pressure)
/* Zero-window probe timer: decide whether to keep probing the peer's
 * closed window or give up on the connection. */
272 static void tcp_probe_timer(struct sock *sk)
274 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* Probing only makes sense when data is queued but nothing is in
 * flight; otherwise bail out (branch body elided). */
277 if (tp->packets_out || !tp->send_head) {
282 /* *WARNING* RFC 1122 forbids this
284 * It doesn't AFAIK, because we kill the retransmit timer -AK
286 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
287 * this behaviour in Solaris down as a bug fix. [AC]
289 * Let me explain. probes_out is zeroed by incoming ACKs
290 * even if they advertise zero window. Hence, connection is killed only
291 * if we received no ACKs for normal connection timeout. It is not killed
292 * only because window stays zero for some time, window may be zero
293 * until armageddon and even later. We are in full accordance
294 * with RFCs, only probe timer combines both retransmission timeout
295 * and probe timeout in one bottle. --ANK
297 max_probes = sysctl_tcp_retries2;
/* Orphaned socket (elided condition): use the tighter orphan limit
 * and possibly reclaim it under resource pressure. */
300 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
302 max_probes = tcp_orphan_retries(sk, alive);
304 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
/* Too many unanswered probes: abort (call elided). */
308 if (tp->probes_out > max_probes) {
311 /* Only send another probe if we didn't close things up. */
317 * The TCP retransmit timer.
/* Retransmission timeout handler: handles the pathological
 * window-shrink case, checks the give-up criteria, classifies the
 * failure for statistics, retransmits the head of the write queue,
 * and backs the RTO off exponentially. */
320 static void tcp_retransmit_timer(struct sock *sk)
322 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* Nothing in flight: spurious fire, just return (elided). */
324 if (tp->packets_out == 0)
327 BUG_TRAP(!skb_queue_empty(&sk->write_queue));
329 if (tp->snd_wnd == 0 && !sk->dead &&
330 !((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV))) {
331 /* Receiver dastardly shrinks window. Our retransmits
332 * become zero probes, but we should not timeout this
333 * connection. If the socket is an orphan, time it out,
334 * we cannot allow such beasts to hang infinitely.
338 printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
339 NIPQUAD(sk->daddr), htons(sk->dport), sk->num,
340 tp->snd_una, tp->snd_nxt);
/* No ACK at all for longer than TCP_RTO_MAX: give up (elided). */
342 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
346 tcp_enter_loss(sk, 0);
347 tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
349 goto out_reset_timer;
/* Nonzero return means the connection was killed; stop here. */
352 if (tcp_write_timeout(sk))
/* First timeout in this episode: classify the failure by the
 * congestion-avoidance state we were in, for SNMP statistics. */
355 if (tp->retransmits == 0) {
356 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
358 if (tp->ca_state == TCP_CA_Recovery)
359 NET_INC_STATS_BH(TCPSackRecoveryFail);
361 NET_INC_STATS_BH(TCPSackFailures);
363 if (tp->ca_state == TCP_CA_Recovery)
364 NET_INC_STATS_BH(TCPRenoRecoveryFail);
366 NET_INC_STATS_BH(TCPRenoFailures);
368 } else if (tp->ca_state == TCP_CA_Loss) {
369 NET_INC_STATS_BH(TCPLossFailures);
371 NET_INC_STATS_BH(TCPTimeouts);
/* Per the helper name this is the F-RTO path; details elided --
 * TODO confirm against the full source. */
375 if (tcp_use_frto(sk)) {
378 tcp_enter_loss(sk, 0);
/* Positive return = local resource failure, not network loss. */
381 if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
382 /* Retransmission failed because of local congestion,
/* Retry soon without inflating the backoff counters. */
385 if (!tp->retransmits)
387 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
388 min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
392 /* Increase the timeout each time we retransmit. Note that
393 * we do not increase the rtt estimate. rto is initialized
394 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
395 * that doubling rto each time is the least we can get away with.
396 * In KA9Q, Karn uses this for the first few times, and then
397 * goes to quadratic. netBSD doubles, but only goes up to *64,
398 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
399 * defined in the protocol as the maximum possible RTT. I guess
400 * we'll have to use something other than TCP to talk to the
401 * University of Mars.
403 * PAWS allows us longer timeouts and large windows, so once
404 * implemented ftp to mars will work nicely. We will have to fix
405 * the 120 second clamps though!
/* Exponential backoff, capped at TCP_RTO_MAX. */
411 tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
412 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
/* Past retries1: route may be stale (follow-up call elided). */
413 if (tp->retransmits > sysctl_tcp_retries1)
/* Write-timer dispatcher: defers while the socket is user-locked,
 * re-arms itself if it fired before tp->timeout, then dispatches on
 * tp->pending (retransmission vs. zero-window probe).  The switch
 * head, the probe case body, and the default (timer_bug_msg) are
 * elided in this listing. */
419 static void tcp_write_timer(unsigned long data)
421 struct sock *sk = (struct sock*)data;
422 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
426 if (sk->lock.users) {
427 /* Try again later */
/* HZ/20 = 50ms backoff while user context holds the socket. */
428 if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
433 if (sk->state == TCP_CLOSE || !tp->pending)
/* Fired early: re-arm for the recorded deadline. */
436 if ((long)(tp->timeout - jiffies) > 0) {
437 if (!mod_timer(&tp->retransmit_timer, tp->timeout))
446 case TCP_TIME_RETRANS:
447 tcp_retransmit_timer(sk);
449 case TCP_TIME_PROBE0:
463 * Timer for listening sockets
/* Walk a slice of the listen socket's SYN (open-request) hash table:
 * retransmit unanswered SYN-ACKs that are still under their retry
 * limit, prune the rest, then re-arm via the keepalive timer. */
466 static void tcp_synack_timer(struct sock *sk)
468 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
469 struct tcp_listen_opt *lopt = tp->listen_opt;
470 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
471 int thresh = max_retries;
472 unsigned long now = jiffies;
473 struct open_request **reqp, *req;
/* Empty queue: nothing to scan (and nothing to re-arm for). */
476 if (lopt == NULL || lopt->qlen == 0)
479 /* Normally all the openreqs are young and become mature
480 * (i.e. converted to established socket) for first timeout.
481 * If synack was not acknowledged for 3 seconds, it means
482 * one of the following things: synack was lost, ack was lost,
483 * rtt is high or nobody planned to ack (i.e. synflood).
484 * When server is a bit loaded, queue is populated with old
485 * open requests, reducing effective size of queue.
486 * When server is well loaded, queue size reduces to zero
487 * after several minutes of work. It is not synflood,
488 * it is normal operation. The solution is pruning
489 * too old entries overriding normal timeout, when
490 * situation becomes dangerous.
492 * Essentially, we reserve half of room for young
493 * embrions; and abort old ones without pity, if old
494 * ones are about to clog our table.
/* Queue more than half full: shrink 'thresh' (loop body elided)
 * so old entries are pruned more aggressively. */
496 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
497 int young = (lopt->qlen_young<<1);
500 if (lopt->qlen < young)
/* TCP_DEFER_ACCEPT overrides the retry limit for acked requests. */
507 if (tp->defer_accept)
508 max_retries = tp->defer_accept;
/* Visit only a fraction of the table per tick so a full sweep
 * takes about one TCP_TIMEOUT_INIT. */
510 budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
511 i = lopt->clock_hand;
514 reqp=&lopt->syn_table[i];
515 while ((req = *reqp) != NULL) {
/* Entry expired: retransmit the SYN-ACK if still under its
 * limit, otherwise drop the request entirely. */
516 if ((long)(now - req->expires) >= 0) {
517 if ((req->retrans < thresh ||
518 (req->acked && req->retrans < max_retries))
519 && !req->class->rtx_syn_ack(sk, req, NULL)) {
522 if (req->retrans++ == 0)
/* Exponential backoff of the next expiry (cap elided). */
524 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
526 req->expires = now + timeo;
527 reqp = &req->dl_next;
531 /* Drop this request */
532 write_lock(&tp->syn_wait_lock);
533 *reqp = req->dl_next;
534 write_unlock(&tp->syn_wait_lock);
536 if (req->retrans == 0)
538 tcp_openreq_free(req);
541 reqp = &req->dl_next;
/* Advance the clock hand around the hash table. */
544 i = (i+1)&(TCP_SYNQ_HSIZE-1);
546 } while (--budget > 0);
548 lopt->clock_hand = i;
551 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
/* Cancel the keepalive timer if armed; the successful-cancel body is
 * elided here and presumably drops the timer's socket reference --
 * TODO confirm against the full source. */
554 void tcp_delete_keepalive_timer (struct sock *sk)
556 if (timer_pending(&sk->timer) && del_timer (&sk->timer))
/* (Re)arm the keepalive timer 'len' jiffies from now.  mod_timer()
 * returning 0 means the timer was not previously pending; the
 * follow-up for that case is elided in this listing. */
560 void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
562 if (!mod_timer(&sk->timer, jiffies+len))
/* Enable or disable keepalive probing on an established socket. */
566 void tcp_set_keepalive(struct sock *sk, int val)
/* CLOSE and LISTEN sockets do not run keepalive (sk->timer is used
 * for SYN-ACK processing on listeners). */
568 if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
/* Turning keepalive on: arm the timer for the idle interval. */
571 if (val && !sk->keepopen)
572 tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
/* The guarding condition for this branch is elided; presumably it
 * fires when keepalive is being turned off -- TODO confirm. */
574 tcp_delete_keepalive_timer(sk);
/* Shared sk->timer handler: SYN-ACK maintenance for LISTEN sockets,
 * FIN_WAIT2 timeout for dead sockets, and keepalive probing for
 * established connections. */
578 static void tcp_keepalive_timer (unsigned long data)
580 struct sock *sk = (struct sock *) data;
581 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
584 /* Only process if socket is not in use. */
586 if (sk->lock.users) {
587 /* Try again later. */
588 tcp_reset_keepalive_timer (sk, HZ/20);
/* Listening sockets: service the SYN queue instead of keepalive. */
592 if (sk->state == TCP_LISTEN) {
593 tcp_synack_timer(sk);
/* Orphaned FIN_WAIT2: either migrate to timewait handling or, with
 * linger2 < 0, reset the connection immediately. */
597 if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
598 if (tp->linger2 >= 0) {
599 int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
602 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
606 tcp_send_active_reset(sk, GFP_ATOMIC);
/* Keepalive disabled or socket closed: nothing further to do. */
610 if (!sk->keepopen || sk->state == TCP_CLOSE)
613 elapsed = keepalive_time_when(tp);
615 /* It is alive without keepalive 8) */
616 if (tp->packets_out || tp->send_head)
/* Idle time since the last segment received from the peer. */
619 elapsed = tcp_time_stamp - tp->rcv_tstamp;
621 if (elapsed >= keepalive_time_when(tp)) {
/* Probe limit exhausted (per-socket override or sysctl): reset
 * and abort the connection (teardown elided). */
622 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
623 (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
624 tcp_send_active_reset(sk, GFP_ATOMIC);
/* <= 0 from tcp_write_wakeup() means the probe went out (or was
 * unnecessary); schedule the next probe one keepalive interval out. */
628 if (tcp_write_wakeup(sk) <= 0) {
630 elapsed = keepalive_intvl_when(tp);
632 /* If keepalive was lost due to local congestion,
635 elapsed = TCP_RESOURCE_PROBE_INTERVAL;
638 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
639 elapsed = keepalive_time_when(tp) - elapsed;
646 tcp_reset_keepalive_timer (sk, elapsed);