2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.237.2.1 2002/01/15 08:49:49 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * to a single port at the same time.
53 #include <linux/config.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
65 #include <net/inet_common.h>
67 #include <linux/inet.h>
68 #include <linux/stddef.h>
69 #include <linux/ipsec.h>
71 extern int sysctl_ip_dynaddr;
72 extern int sysctl_ip_default_ttl;
73 int sysctl_tcp_tw_reuse = 0;
74 int sysctl_tcp_low_latency = 0;
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
79 /* Socket used for sending RSTs */
80 static struct inode tcp_inode;
81 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
87 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
89 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
94 __tcp_listening_hash: { NULL, },
95 __tcp_lhash_lock: RW_LOCK_UNLOCKED,
96 __tcp_lhash_users: ATOMIC_INIT(0),
98 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
99 __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
103 * This array holds the first and last local port number.
104 * For high-usage systems, use sysctl to change this to
107 int sysctl_local_port_range[2] = { 1024, 4999 };
108 int tcp_port_rover = (1024 - 1);
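/*
 * Editor's note (illustrative, not part of the original file): the range
 * above is exported through /proc, so a busy server can widen it at run
 * time without recompiling, for example:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * The particular range shown is only an illustration.
 */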
110 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
111 __u32 faddr, __u16 fport)
113 int h = ((laddr ^ lport) ^ (faddr ^ fport));
116 return h & (tcp_ehash_size - 1);
119 static __inline__ int tcp_sk_hashfn(struct sock *sk)
121 __u32 laddr = sk->rcv_saddr;
122 __u16 lport = sk->num;
123 __u32 faddr = sk->daddr;
124 __u16 fport = sk->dport;
126 return tcp_hashfn(laddr, lport, faddr, fport);
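/*
 * Editor's illustrative sketch (not from the original file): a standalone
 * rendition of the 4-tuple hash above, assuming the table size is a power
 * of two (which is why masking with size - 1 selects a bucket).  The
 * bit-folding step is shown so the upper address bits still influence the
 * final, masked index; all names below are hypothetical.
 */
#if 0
static unsigned int demo_tcp_ehash_index(unsigned int laddr, unsigned short lport,
					 unsigned int faddr, unsigned short fport,
					 unsigned int ehash_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;	/* fold the high half down */
	h ^= h >> 8;
	return h & (ehash_size - 1);	/* ehash_size must be a power of two */
}
#endif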
129 /* Allocate and initialize a new TCP local port bind bucket.
130 * The bindhash mutex for snum's hash chain must be held here.
132 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
135 struct tcp_bind_bucket *tb;
137 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
142 if((tb->next = head->chain) != NULL)
143 tb->next->pprev = &tb->next;
145 tb->pprev = &head->chain;
150 /* Caller must disable local BH processing. */
151 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
153 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
154 struct tcp_bind_bucket *tb;
156 spin_lock(&head->lock);
157 tb = (struct tcp_bind_bucket *)sk->prev;
158 if ((child->bind_next = tb->owners) != NULL)
159 tb->owners->bind_pprev = &child->bind_next;
161 child->bind_pprev = &tb->owners;
162 child->prev = (struct sock *) tb;
163 spin_unlock(&head->lock);
166 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
169 __tcp_inherit_port(sk, child);
173 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
176 if ((sk->bind_next = tb->owners) != NULL)
177 tb->owners->bind_pprev = &sk->bind_next;
179 sk->bind_pprev = &tb->owners;
180 sk->prev = (struct sock *) tb;
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
185 struct sock *sk2 = tb->owners;
186 int sk_reuse = sk->reuse;
188 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
191 !ipv6_only_sock(sk2) &&
192 (!sk->bound_dev_if ||
193 !sk2->bound_dev_if ||
194 sk->bound_dev_if == sk2->bound_dev_if)) {
197 sk2->state == TCP_LISTEN) {
198 if (!sk2->rcv_saddr ||
200 (sk2->rcv_saddr == sk->rcv_saddr))
208 /* Obtain a reference to a local port for the given sock;
209 * if snum is zero it means select any available local port.
211 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
213 struct tcp_bind_hashbucket *head;
214 struct tcp_bind_bucket *tb;
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
227 if ((rover < low) || (rover > high))
229 head = &tcp_bhash[tcp_bhashfn(rover)];
230 spin_lock(&head->lock);
231 for (tb = head->chain; tb; tb = tb->next)
232 if (tb->port == rover)
236 spin_unlock(&head->lock);
237 } while (--remaining > 0);
238 tcp_port_rover = rover;
239 spin_unlock(&tcp_portalloc_lock);
241 /* Exhausted local port range during search? */
246 /* OK, here is the one we will use. HEAD is
247 * non-NULL and we hold its mutex.
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 for (tb = head->chain; tb != NULL; tb = tb->next)
255 if (tb->port == snum)
258 if (tb != NULL && tb->owners != NULL) {
261 if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
265 if (tcp_bind_conflict(sk, tb))
271 (tb = tcp_bucket_create(head, snum)) == NULL)
273 if (tb->owners == NULL) {
274 if (sk->reuse && sk->state != TCP_LISTEN)
278 } else if (tb->fastreuse &&
279 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
282 if (sk->prev == NULL)
283 tcp_bind_hash(sk, tb, snum);
284 BUG_TRAP(sk->prev == (struct sock *) tb);
288 spin_unlock(&head->lock);
294 /* Get rid of any references to a local port held by the
297 inline void __tcp_put_port(struct sock *sk)
299 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
300 struct tcp_bind_bucket *tb;
302 spin_lock(&head->lock);
303 tb = (struct tcp_bind_bucket *) sk->prev;
305 sk->bind_next->bind_pprev = sk->bind_pprev;
306 *(sk->bind_pprev) = sk->bind_next;
309 if (tb->owners == NULL) {
311 tb->next->pprev = tb->pprev;
312 *(tb->pprev) = tb->next;
313 kmem_cache_free(tcp_bucket_cachep, tb);
315 spin_unlock(&head->lock);
318 void tcp_put_port(struct sock *sk)
325 /* Taking this lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
326 * Look: when several writers sleep and a reader wakes them up, all but one
327 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
328 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
329 * exclusive lock release). It should really be ifdefed.
332 void tcp_listen_wlock(void)
334 write_lock(&tcp_lhash_lock);
336 if (atomic_read(&tcp_lhash_users)) {
337 DECLARE_WAITQUEUE(wait, current);
339 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
341 set_current_state(TASK_UNINTERRUPTIBLE);
342 if (atomic_read(&tcp_lhash_users) == 0)
344 write_unlock_bh(&tcp_lhash_lock);
346 write_lock_bh(&tcp_lhash_lock);
349 __set_current_state(TASK_RUNNING);
350 remove_wait_queue(&tcp_lhash_wait, &wait);
354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
359 BUG_TRAP(sk->pprev==NULL);
360 if(listen_possible && sk->state == TCP_LISTEN) {
361 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
362 lock = &tcp_lhash_lock;
365 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
366 lock = &tcp_ehash[sk->hashent].lock;
369 if((sk->next = *skp) != NULL)
370 (*skp)->pprev = &sk->next;
373 sock_prot_inc_use(sk->prot);
375 if (listen_possible && sk->state == TCP_LISTEN)
376 wake_up(&tcp_lhash_wait);
379 static void tcp_v4_hash(struct sock *sk)
381 if (sk->state != TCP_CLOSE) {
383 __tcp_v4_hash(sk, 1);
388 void tcp_unhash(struct sock *sk)
395 if (sk->state == TCP_LISTEN) {
398 lock = &tcp_lhash_lock;
400 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
402 write_lock_bh(&head->lock);
407 sk->next->pprev = sk->pprev;
408 *sk->pprev = sk->next;
410 sock_prot_dec_use(sk->prot);
412 write_unlock_bh(lock);
415 if (sk->state == TCP_LISTEN)
416 wake_up(&tcp_lhash_wait);
419 /* Don't inline this cruft. There are some nice properties to
420 * exploit here. The BSD API does not allow a listening TCP
421 * to specify the remote port nor the remote address for the
422 * connection. So always assume those are both wildcarded
423 * during the search since they can never be otherwise.
425 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
427 struct sock *result = NULL;
431 for(; sk; sk = sk->next) {
432 if(sk->num == hnum && !ipv6_only_sock(sk)) {
433 __u32 rcv_saddr = sk->rcv_saddr;
435 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
436 score = sk->family == PF_INET ? 1 : 0;
441 if (rcv_saddr != daddr)
445 if (sk->bound_dev_if) {
446 if (sk->bound_dev_if != dif)
452 if (score > hiscore) {
461 /* Optimize the common listener case. */
462 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
466 read_lock(&tcp_lhash_lock);
467 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
469 if (sk->num == hnum &&
471 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
472 (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
475 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
481 read_unlock(&tcp_lhash_lock);
485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
488 * Local BH must be disabled here.
491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492 u32 daddr, u16 hnum, int dif)
494 struct tcp_ehash_bucket *head;
495 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
496 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
500 /* Optimize here for direct hit, only listening connections can
501 * have wildcards anyway.
503 hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 head = &tcp_ehash[hash];
505 read_lock(&head->lock);
506 for(sk = head->chain; sk; sk = sk->next) {
507 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 goto hit; /* You sunk my battleship! */
511 /* Must check for a TIME_WAIT'er before going to listener hash. */
512 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
513 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
515 read_unlock(&head->lock);
521 read_unlock(&head->lock);
525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 u32 daddr, u16 hnum, int dif)
530 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
535 return tcp_v4_lookup_listener(daddr, hnum, dif);
538 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
543 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
549 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
551 return secure_tcp_sequence_number(skb->nh.iph->daddr,
557 /* called with local bh disabled */
558 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
559 struct tcp_tw_bucket **twp)
561 u32 daddr = sk->rcv_saddr;
562 u32 saddr = sk->daddr;
563 int dif = sk->bound_dev_if;
564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565 __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
566 int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
567 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
568 struct sock *sk2, **skp;
569 struct tcp_tw_bucket *tw;
571 write_lock(&head->lock);
573 /* Check TIME-WAIT sockets first. */
574 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
576 tw = (struct tcp_tw_bucket*)sk2;
578 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
581 /* With PAWS, it is safe from the viewpoint
582 of data integrity. Even without PAWS it
583 is safe provided sequence spaces do not
584 overlap i.e. at data rates <= 80Mbit/sec.
586 Actually, the idea is close to VJ's one,
587 only the timestamp cache is held not per host,
588 but per port pair, and the TW bucket is used
591 If the TW bucket has already been destroyed we
592 fall back to VJ's scheme and use the initial
593 timestamp retrieved from the peer table.
595 if (tw->ts_recent_stamp &&
596 (!twp || (sysctl_tcp_tw_reuse &&
597 xtime.tv_sec - tw->ts_recent_stamp > 1))) {
598 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
600 tp->ts_recent = tw->ts_recent;
601 tp->ts_recent_stamp = tw->ts_recent_stamp;
611 /* And established part... */
612 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
613 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
618 /* Must record num and sport now. Otherwise we will see
619 * a socket with a funny identity in the hash table. */
621 sk->sport = htons(lport);
622 BUG_TRAP(sk->pprev==NULL);
623 if ((sk->next = *skp) != NULL)
624 (*skp)->pprev = &sk->next;
629 sock_prot_inc_use(sk->prot);
630 write_unlock(&head->lock);
634 NET_INC_STATS_BH(TimeWaitRecycled);
636 /* Silly. Should hash-dance instead... */
637 tcp_tw_deschedule(tw);
638 tcp_timewait_kill(tw);
639 NET_INC_STATS_BH(TimeWaitRecycled);
647 write_unlock(&head->lock);
648 return -EADDRNOTAVAIL;
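/*
 * Editor's note (illustrative, not part of the original file): the reuse
 * test above reduces to the predicate below.  A TIME-WAIT bucket that has
 * a recorded timestamp may be recycled either when the caller is itself
 * timewait-aware (twp == NULL) or, with sysctl_tcp_tw_reuse set, once the
 * last recorded timestamp is more than one second old.
 */
#if 0
static int demo_tw_reusable(const struct tcp_tw_bucket *tw,
			    struct tcp_tw_bucket **twp)
{
	return tw->ts_recent_stamp &&
	       (twp == NULL ||
		(sysctl_tcp_tw_reuse &&
		 xtime.tv_sec - tw->ts_recent_stamp > 1));
}
#endif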
652 * Bind a port for a connect operation and hash it.
654 static int tcp_v4_hash_connect(struct sock *sk)
656 unsigned short snum = sk->num;
657 struct tcp_bind_hashbucket *head;
658 struct tcp_bind_bucket *tb;
662 int low = sysctl_local_port_range[0];
663 int high = sysctl_local_port_range[1];
664 int remaining = (high - low) + 1;
665 struct tcp_tw_bucket *tw = NULL;
669 /* TODO. Actually it is not such a bad idea to remove
670 * tcp_portalloc_lock before the next submission to Linus.
671 * As soon as we touch this place at all it is time to think.
673 * Now it protects only the single _advisory_ variable tcp_port_rover,
674 * hence it is mostly useless.
675 * The code will work nicely if we just delete it, but
676 * I am afraid that in the contended case it will work no better or
677 * even worse: another cpu will just hit the same bucket
679 * So some per-cpu salt could remove both the contention and the
680 * memory pingpong. Any ideas how to do this in a nice way?
682 spin_lock(&tcp_portalloc_lock);
683 rover = tcp_port_rover;
687 if ((rover < low) || (rover > high))
689 head = &tcp_bhash[tcp_bhashfn(rover)];
690 spin_lock(&head->lock);
692 /* Does not bother with rcv_saddr checks,
693 * because the established check is already
696 for (tb = head->chain; tb; tb = tb->next) {
697 if (tb->port == rover) {
698 BUG_TRAP(tb->owners != NULL);
699 if (tb->fastreuse >= 0)
701 if (!__tcp_v4_check_established(sk, rover, &tw))
707 tb = tcp_bucket_create(head, rover);
709 spin_unlock(&head->lock);
716 spin_unlock(&head->lock);
717 } while (--remaining > 0);
718 tcp_port_rover = rover;
719 spin_unlock(&tcp_portalloc_lock);
723 return -EADDRNOTAVAIL;
726 /* All locks still held and bhs disabled */
727 tcp_port_rover = rover;
728 spin_unlock(&tcp_portalloc_lock);
730 tcp_bind_hash(sk, tb, rover);
732 sk->sport = htons(rover);
733 __tcp_v4_hash(sk, 0);
735 spin_unlock(&head->lock);
738 tcp_tw_deschedule(tw);
739 tcp_timewait_kill(tw);
747 head = &tcp_bhash[tcp_bhashfn(snum)];
748 tb = (struct tcp_bind_bucket *)sk->prev;
749 spin_lock_bh(&head->lock);
750 if (tb->owners == sk && sk->bind_next == NULL) {
751 __tcp_v4_hash(sk, 0);
752 spin_unlock_bh(&head->lock);
756 spin_unlock(&head->lock);
757 /* No definite answer... Walk to established hash table */
758 ret = __tcp_v4_check_established(sk, snum, NULL);
764 /* This will initiate an outgoing connection. */
765 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
767 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
768 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
774 if (addr_len < sizeof(struct sockaddr_in))
777 if (usin->sin_family != AF_INET)
778 return(-EAFNOSUPPORT);
780 nexthop = daddr = usin->sin_addr.s_addr;
781 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
784 nexthop = sk->protinfo.af_inet.opt->faddr;
787 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
788 RT_CONN_FLAGS(sk), sk->bound_dev_if);
792 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
797 __sk_dst_set(sk, &rt->u.dst);
798 sk->route_caps = rt->u.dst.dev->features;
800 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
804 sk->saddr = rt->rt_src;
805 sk->rcv_saddr = sk->saddr;
807 if (tp->ts_recent_stamp && sk->daddr != daddr) {
808 /* Reset inherited state */
810 tp->ts_recent_stamp = 0;
814 if (sysctl_tcp_tw_recycle &&
815 !tp->ts_recent_stamp &&
816 rt->rt_dst == daddr) {
817 struct inet_peer *peer = rt_get_peer(rt);
819 /* VJ's idea. We save the last timestamp seen from
820 * the destination in the peer table, when entering state TIME-WAIT,
821 * and initialize ts_recent from it when trying a new connection.
824 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
825 tp->ts_recent_stamp = peer->tcp_ts_stamp;
826 tp->ts_recent = peer->tcp_ts;
830 sk->dport = usin->sin_port;
833 tp->ext_header_len = 0;
834 if (sk->protinfo.af_inet.opt)
835 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
839 /* Socket identity is still unknown (sport may be zero).
840 * However we set the state to SYN-SENT and, without releasing the socket
841 * lock, select a source port, enter ourselves into the hash tables and
842 * complete initialization after this.
844 tcp_set_state(sk, TCP_SYN_SENT);
845 err = tcp_v4_hash_connect(sk);
850 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
851 sk->sport, usin->sin_port);
853 sk->protinfo.af_inet.id = tp->write_seq^jiffies;
855 err = tcp_connect(sk);
862 tcp_set_state(sk, TCP_CLOSE);
869 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
871 return ((struct rtable*)skb->dst)->rt_iif;
874 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
876 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
880 struct open_request ***prevp,
882 __u32 raddr, __u32 laddr)
884 struct tcp_listen_opt *lopt = tp->listen_opt;
885 struct open_request *req, **prev;
887 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
888 (req = *prev) != NULL;
889 prev = &req->dl_next) {
890 if (req->rmt_port == rport &&
891 req->af.v4_req.rmt_addr == raddr &&
892 req->af.v4_req.loc_addr == laddr &&
893 TCP_INET_FAMILY(req->class->family)) {
894 BUG_TRAP(req->sk == NULL);
903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
905 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
906 struct tcp_listen_opt *lopt = tp->listen_opt;
907 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
909 req->expires = jiffies + TCP_TIMEOUT_INIT;
912 req->dl_next = lopt->syn_table[h];
914 write_lock(&tp->syn_wait_lock);
915 lopt->syn_table[h] = req;
916 write_unlock(&tp->syn_wait_lock);
923 * This routine does path mtu discovery as defined in RFC1191.
925 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
927 struct dst_entry *dst;
928 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
930 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
931 * sent out by Linux are always <576 bytes so they should go through
934 if (sk->state == TCP_LISTEN)
937 /* We don't check in the dst entry whether pmtu discovery is forbidden
938 * on this route. We just assume that no packet-too-big packets
939 * are sent back when pmtu discovery is not active.
940 * There is a small race when the user changes this flag in the
941 * route, but I think that's acceptable.
943 if ((dst = __sk_dst_check(sk, 0)) == NULL)
946 ip_rt_update_pmtu(dst, mtu);
948 /* Something is about to go wrong... Remember the soft error
949 * in case this connection will not be able to recover.
951 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
952 sk->err_soft = EMSGSIZE;
954 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
955 tp->pmtu_cookie > dst->pmtu) {
956 tcp_sync_mss(sk, dst->pmtu);
958 /* Resend the TCP packet because it's
959 * clear that the old packet has been
960 * dropped. This is the new "fast" path mtu
963 tcp_simple_retransmit(sk);
964 } /* else let the usual retransmit timer handle it */
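/*
 * Editor's illustrative sketch (not part of the original file): the effect
 * of tcp_sync_mss() above is, roughly, to re-derive the sending MSS from
 * the new path MTU by subtracting the fixed IPv4 and TCP header sizes and
 * whatever option space the connection uses.  The helper below is only a
 * back-of-the-envelope illustration with hypothetical names.
 */
#if 0
static unsigned int demo_mss_from_pmtu(unsigned int pmtu,
					unsigned int ip_opt_len,
					unsigned int tcp_opt_len)
{
	/* 20 bytes of IPv4 header and 20 bytes of TCP header, plus options */
	return pmtu - (20 + ip_opt_len) - (20 + tcp_opt_len);
}
#endif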
968 * This routine is called by the ICMP module when it gets some
969 * sort of error condition. If err < 0 then the socket should
970 * be closed and the error returned to the user. If err > 0
971 * it's just the icmp type << 8 | icmp code. After adjustment,
972 * the header points to the first 8 bytes of the tcp header. We need
973 * to find the appropriate port.
975 * The locking strategy used here is very "optimistic". When
976 * someone else accesses the socket the ICMP is just dropped
977 * and for some paths there is no check at all.
978 * A more general error queue to queue errors for later handling
979 * is probably better.
983 void tcp_v4_err(struct sk_buff *skb, u32 info)
985 struct iphdr *iph = (struct iphdr*)skb->data;
986 struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
988 int type = skb->h.icmph->type;
989 int code = skb->h.icmph->code;
994 if (skb->len < (iph->ihl << 2) + 8) {
995 ICMP_INC_STATS_BH(IcmpInErrors);
999 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
1001 ICMP_INC_STATS_BH(IcmpInErrors);
1004 if (sk->state == TCP_TIME_WAIT) {
1005 tcp_tw_put((struct tcp_tw_bucket*)sk);
1010 /* If too many ICMPs get dropped on busy
1011 * servers this needs to be solved differently.
1013 if (sk->lock.users != 0)
1014 NET_INC_STATS_BH(LockDroppedIcmps);
1016 if (sk->state == TCP_CLOSE)
1019 tp = &sk->tp_pinfo.af_tcp;
1020 seq = ntohl(th->seq);
1021 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1022 NET_INC_STATS(OutOfWindowIcmps);
1027 case ICMP_SOURCE_QUENCH:
1028 /* Just silently ignore these. */
1030 case ICMP_PARAMETERPROB:
1033 case ICMP_DEST_UNREACH:
1034 if (code > NR_ICMP_UNREACH)
1037 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1038 if (sk->lock.users == 0)
1039 do_pmtu_discovery(sk, iph, info);
1043 err = icmp_err_convert[code].errno;
1045 case ICMP_TIME_EXCEEDED:
1052 switch (sk->state) {
1053 struct open_request *req, **prev;
1055 if (sk->lock.users != 0)
1058 req = tcp_v4_search_req(tp, &prev,
1060 iph->daddr, iph->saddr);
1064 /* ICMPs are not backlogged, hence we cannot get
1065 an established socket here.
1067 BUG_TRAP(req->sk == NULL);
1069 if (seq != req->snt_isn) {
1070 NET_INC_STATS_BH(OutOfWindowIcmps);
1075 * Still in SYN_RECV, just remove it silently.
1076 * There is no good way to pass the error to the newly
1077 * created socket, and POSIX does not want network
1078 * errors returned from accept().
1080 tcp_synq_drop(sk, req, prev);
1084 case TCP_SYN_RECV: /* Cannot happen.
1085 It can happen, e.g., if SYNs crossed.
1087 if (sk->lock.users == 0) {
1088 TCP_INC_STATS_BH(TcpAttemptFails);
1091 sk->error_report(sk);
1100 /* If we've already connected we will keep trying
1101 * until we time out, or the user gives up.
1103 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1104 * to be considered hard errors (well, FRAG_FAILED too,
1105 * but it is obsoleted by pmtu discovery).
1107 * Note that in the modern internet, where routing is unreliable
1108 * and broken firewalls sit in every dark corner sending random
1109 * errors ordered by their masters, even these two messages finally lose
1110 * their original sense (even Linux sends invalid PORT_UNREACHs).
1112 * Now we are in compliance with RFCs.
1116 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1118 sk->error_report(sk);
1119 } else { /* Only an error on timeout */
1128 /* This routine computes an IPv4 TCP checksum. */
1129 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1130 struct sk_buff *skb)
1132 if (skb->ip_summed == CHECKSUM_HW) {
1133 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1134 skb->csum = offsetof(struct tcphdr, check);
1136 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1137 csum_partial((char *)th, th->doff<<2, skb->csum));
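/*
 * Editor's note (illustrative, not from the original file): tcp_v4_check()
 * above folds the standard TCP/IPv4 pseudo-header into the checksum.  For
 * reference, the material covered by that pseudo-header looks like this
 * (the struct name and layout below are for illustration only):
 */
#if 0
struct demo_tcp_pseudo_header {
	__u32	saddr;		/* IPv4 source address */
	__u32	daddr;		/* IPv4 destination address */
	__u8	zero;		/* always zero */
	__u8	protocol;	/* IPPROTO_TCP (6) */
	__u16	length;		/* TCP header + payload length, network order */
};
#endif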
1142 * This routine will send an RST to the other tcp.
1144 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
1146 * Answer: if a packet caused a RST, it is not for a socket
1147 * existing in our system; if it is matched to a socket,
1148 * it is just a duplicate segment or a bug in the other side's TCP.
1149 * So we build the reply based only on the parameters
1150 * that arrived with the segment.
1151 * Exception: precedence violation. We do not implement it in any case.
1154 static void tcp_v4_send_reset(struct sk_buff *skb)
1156 struct tcphdr *th = skb->h.th;
1158 struct ip_reply_arg arg;
1160 /* Never send a reset in response to a reset. */
1164 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1167 /* Swap the send and the receive. */
1168 memset(&rth, 0, sizeof(struct tcphdr));
1169 rth.dest = th->source;
1170 rth.source = th->dest;
1171 rth.doff = sizeof(struct tcphdr)/4;
1175 rth.seq = th->ack_seq;
1178 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1179 + skb->len - (th->doff<<2));
1182 memset(&arg, 0, sizeof arg);
1183 arg.iov[0].iov_base = (unsigned char *)&rth;
1184 arg.iov[0].iov_len = sizeof rth;
1185 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1186 skb->nh.iph->saddr, /*XXX*/
1187 sizeof(struct tcphdr),
1191 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1193 tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
1194 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1196 TCP_INC_STATS_BH(TcpOutSegs);
1197 TCP_INC_STATS_BH(TcpOutRsts);
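/*
 * Editor's illustrative sketch (not part of the original file): when the
 * offending segment carried no ACK, the reset built above acknowledges
 * exactly the sequence space that segment consumed, i.e. its payload
 * length plus one for a SYN and one for a FIN, each of which occupies a
 * sequence number.  A hypothetical helper expressing that:
 */
#if 0
static u32 demo_rst_ack_seq(const struct tcphdr *th, unsigned int skb_len)
{
	unsigned int payload = skb_len - (th->doff << 2);

	return htonl(ntohl(th->seq) + th->syn + th->fin + payload);
}
#endif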
1200 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1201 outside socket context, is certainly ugly. What can I do?
1204 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1206 struct tcphdr *th = skb->h.th;
1211 struct ip_reply_arg arg;
1213 memset(&rep.th, 0, sizeof(struct tcphdr));
1214 memset(&arg, 0, sizeof arg);
1216 arg.iov[0].iov_base = (unsigned char *)&rep;
1217 arg.iov[0].iov_len = sizeof(rep.th);
1220 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
1221 (TCPOPT_NOP << 16) |
1222 (TCPOPT_TIMESTAMP << 8) |
1224 rep.tsopt[1] = htonl(tcp_time_stamp);
1225 rep.tsopt[2] = htonl(ts);
1226 arg.iov[0].iov_len = sizeof(rep);
1229 /* Swap the send and the receive. */
1230 rep.th.dest = th->source;
1231 rep.th.source = th->dest;
1232 rep.th.doff = arg.iov[0].iov_len/4;
1233 rep.th.seq = htonl(seq);
1234 rep.th.ack_seq = htonl(ack);
1236 rep.th.window = htons(win);
1238 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1239 skb->nh.iph->saddr, /*XXX*/
1243 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1245 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1247 TCP_INC_STATS_BH(TcpOutSegs);
1250 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1252 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1254 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1255 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1260 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1262 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1266 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1269 struct ip_options *opt;
1271 opt = req->af.v4_req.opt;
1272 if(ip_route_output(&rt, ((opt && opt->srr) ?
1274 req->af.v4_req.rmt_addr),
1275 req->af.v4_req.loc_addr,
1276 RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
1277 IP_INC_STATS_BH(IpOutNoRoutes);
1280 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1282 IP_INC_STATS_BH(IpOutNoRoutes);
1289 * Send a SYN-ACK after having received an ACK.
1290 * This still operates on an open_request only, not on a big
1293 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1294 struct dst_entry *dst)
1297 struct sk_buff * skb;
1299 /* First, grab a route. */
1301 (dst = tcp_v4_route_req(sk, req)) == NULL)
1304 skb = tcp_make_synack(sk, dst, req);
1307 struct tcphdr *th = skb->h.th;
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1311 csum_partial((char *)th, skb->len, skb->csum));
1313 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1314 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1315 if (err == NET_XMIT_CN)
1325 * IPv4 open_request destructor.
1327 static void tcp_v4_or_free(struct open_request *req)
1329 if (req->af.v4_req.opt)
1330 kfree(req->af.v4_req.opt);
1333 static inline void syn_flood_warning(struct sk_buff *skb)
1335 static unsigned long warntime;
1337 if (jiffies - warntime > HZ*60) {
1340 "possible SYN flooding on port %d. Sending cookies.\n",
1341 ntohs(skb->h.th->dest));
1346 * Save and compile IPv4 options into the open_request if needed.
1348 static inline struct ip_options *
1349 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1351 struct ip_options *opt = &(IPCB(skb)->opt);
1352 struct ip_options *dopt = NULL;
1354 if (opt && opt->optlen) {
1355 int opt_size = optlength(opt);
1356 dopt = kmalloc(opt_size, GFP_ATOMIC);
1358 if (ip_options_echo(dopt, skb)) {
1368 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1369 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1370 * It would be better to replace it with a global counter for all sockets,
1371 * but then some measure against one socket starving all other sockets
1374 * It was 128 by default. Experiments with real servers show that
1375 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1376 * of the problems. This value is adjusted to 128 for very small machines
1377 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1378 * Increasing it further requires changing the hash table size.
1380 int sysctl_max_syn_backlog = 256;
1382 struct or_calltable or_ipv4 = {
1390 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1393 struct open_request *req;
1394 __u32 saddr = skb->nh.iph->saddr;
1395 __u32 daddr = skb->nh.iph->daddr;
1396 __u32 isn = TCP_SKB_CB(skb)->when;
1397 struct dst_entry *dst = NULL;
1398 #ifdef CONFIG_SYN_COOKIES
1399 int want_cookie = 0;
1401 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1404 /* Never answer SYNs sent to broadcast or multicast */
1405 if (((struct rtable *)skb->dst)->rt_flags &
1406 (RTCF_BROADCAST|RTCF_MULTICAST))
1409 /* TW buckets are converted to open requests without
1410 * limitation; they conserve resources and the peer is
1411 * evidently a real one.
1413 if (tcp_synq_is_full(sk) && !isn) {
1414 #ifdef CONFIG_SYN_COOKIES
1415 if (sysctl_tcp_syncookies) {
1422 /* Accept backlog is full. If we have already queued enough
1423 * warm entries in the syn queue, drop the request. It is better than
1424 * clogging the syn queue with openreqs with exponentially increasing
1427 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1430 req = tcp_openreq_alloc();
1434 tcp_clear_options(&tp);
1436 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1438 tcp_parse_options(skb, &tp, 0);
1441 tcp_clear_options(&tp);
1445 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1446 /* Some OSes (unknown ones, but I see them on web servers which
1447 * contain information interesting only for windows
1448 * users) do not send their stamp in the SYN. It is an easy case.
1449 * We simply do not advertise TS support.
1454 tp.tstamp_ok = tp.saw_tstamp;
1456 tcp_openreq_init(req, &tp, skb);
1458 req->af.v4_req.loc_addr = daddr;
1459 req->af.v4_req.rmt_addr = saddr;
1460 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1461 req->class = &or_ipv4;
1463 TCP_ECN_create_request(req, skb->h.th);
1466 #ifdef CONFIG_SYN_COOKIES
1467 syn_flood_warning(skb);
1469 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1470 } else if (isn == 0) {
1471 struct inet_peer *peer = NULL;
1473 /* VJ's idea. We save the last timestamp seen
1474 * from the destination in the peer table, when entering
1475 * state TIME-WAIT, and check against it before
1476 * accepting a new connection request.
1478 * If "isn" is not zero, this request hit an alive
1479 * timewait bucket, so all the necessary checks
1480 * are made in the function processing the timewait state.
1482 if (tp.saw_tstamp &&
1483 sysctl_tcp_tw_recycle &&
1484 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1485 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1486 peer->v4daddr == saddr) {
1487 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1488 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1489 NET_INC_STATS_BH(PAWSPassiveRejected);
1494 /* Kill the following clause, if you dislike this way. */
1495 else if (!sysctl_tcp_syncookies &&
1496 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1497 < (sysctl_max_syn_backlog>>2)) &&
1498 (!peer || !peer->tcp_ts_stamp) &&
1499 (!dst || !dst->rtt)) {
1500 /* Without syncookies the last quarter of the
1501 * backlog is filled with destinations proven to be alive.
1502 * It means that we continue to communicate
1503 * with destinations already remembered
1504 * at the moment of the synflood.
1506 NETDEBUG(if (net_ratelimit()) \
1507 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1508 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1513 isn = tcp_v4_init_sequence(sk, skb);
1517 if (tcp_v4_send_synack(sk, req, dst))
1521 tcp_openreq_free(req);
1523 tcp_v4_synq_add(sk, req);
1528 tcp_openreq_free(req);
1530 TCP_INC_STATS_BH(TcpAttemptFails);
1536 * The three way handshake has completed - we got a valid ACK -
1537 * now create the new socket.
1539 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1540 struct open_request *req,
1541 struct dst_entry *dst)
1543 struct tcp_opt *newtp;
1546 if (tcp_acceptq_is_full(sk))
1550 (dst = tcp_v4_route_req(sk, req)) == NULL)
1553 newsk = tcp_create_openreq_child(sk, req, skb);
1557 newsk->dst_cache = dst;
1558 newsk->route_caps = dst->dev->features;
1560 newtp = &(newsk->tp_pinfo.af_tcp);
1561 newsk->daddr = req->af.v4_req.rmt_addr;
1562 newsk->saddr = req->af.v4_req.loc_addr;
1563 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1564 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1565 req->af.v4_req.opt = NULL;
1566 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1567 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1568 newtp->ext_header_len = 0;
1569 if (newsk->protinfo.af_inet.opt)
1570 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1571 newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1573 tcp_sync_mss(newsk, dst->pmtu);
1574 newtp->advmss = dst->advmss;
1575 tcp_initialize_rcv_mss(newsk);
1577 __tcp_v4_hash(newsk, 0);
1578 __tcp_inherit_port(sk, newsk);
1583 NET_INC_STATS_BH(ListenOverflows);
1585 NET_INC_STATS_BH(ListenDrops);
1590 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1592 struct open_request *req, **prev;
1593 struct tcphdr *th = skb->h.th;
1594 struct iphdr *iph = skb->nh.iph;
1595 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1598 /* Find possible connection requests. */
1599 req = tcp_v4_search_req(tp, &prev,
1601 iph->saddr, iph->daddr);
1603 return tcp_check_req(sk, skb, req, prev);
1605 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1612 if (nsk->state != TCP_TIME_WAIT) {
1616 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1620 #ifdef CONFIG_SYN_COOKIES
1621 if (!th->rst && !th->syn && th->ack)
1622 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1627 static int tcp_v4_checksum_init(struct sk_buff *skb)
1629 if (skb->ip_summed == CHECKSUM_HW) {
1630 skb->ip_summed = CHECKSUM_UNNECESSARY;
1631 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1632 skb->nh.iph->daddr,skb->csum))
1635 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1636 skb->ip_summed = CHECKSUM_NONE;
1638 if (skb->len <= 76) {
1639 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1641 skb_checksum(skb, 0, skb->len, 0)))
1643 skb->ip_summed = CHECKSUM_UNNECESSARY;
1645 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1646 skb->nh.iph->daddr,0);
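/*
 * Editor's note (illustrative, not part of the original file): the routine
 * above only decides how much checksum work happens up front.  Hardware-
 * verified and very short (<= 76 byte) segments are checked immediately;
 * for longer software-checksummed segments only a pseudo-header seed is
 * left in skb->csum so the full verification can be deferred and combined
 * with a later pass over the data.  A sketch of that decision, with a
 * hypothetical name:
 */
#if 0
static int demo_csum_is_deferred(const struct sk_buff *skb)
{
	return skb->ip_summed != CHECKSUM_UNNECESSARY && skb->len > 76;
}
#endif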
1652 /* The socket must have its spinlock held when we get
1655 * We have a potential double-lock case here, so even when
1656 * doing backlog processing we use the BH locking scheme.
1657 * This is because we cannot sleep with the original spinlock
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1662 IP_INC_STATS_BH(IpInDelivers);
1664 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1665 TCP_CHECK_TIMER(sk);
1666 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1668 TCP_CHECK_TIMER(sk);
1672 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1675 if (sk->state == TCP_LISTEN) {
1676 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1681 if (tcp_child_process(sk, nsk, skb))
1687 TCP_CHECK_TIMER(sk);
1688 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1690 TCP_CHECK_TIMER(sk);
1694 tcp_v4_send_reset(skb);
1697 /* Be careful here. If this function gets more complicated and
1698 * gcc suffers from register pressure on the x86, sk (in %ebx)
1699 * might be destroyed here. This current version compiles correctly,
1700 * but you have been warned.
1705 TCP_INC_STATS_BH(TcpInErrs);
1713 int tcp_v4_rcv(struct sk_buff *skb)
1719 if (skb->pkt_type!=PACKET_HOST)
1722 /* Count it even if it's bad */
1723 TCP_INC_STATS_BH(TcpInSegs);
1725 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1730 if (th->doff < sizeof(struct tcphdr)/4)
1732 if (!pskb_may_pull(skb, th->doff*4))
1735 /* An explanation is required here, I think.
1736 * Packet length and doff are validated by header prediction,
1737 * provided the case of th->doff==0 is eliminated.
1738 * So, we defer the checks. */
1739 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1740 tcp_v4_checksum_init(skb) < 0))
1744 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1745 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1746 skb->len - th->doff*4);
1747 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1748 TCP_SKB_CB(skb)->when = 0;
1749 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1750 TCP_SKB_CB(skb)->sacked = 0;
1752 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1753 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1759 if(!ipsec_sk_policy(sk,skb))
1760 goto discard_and_relse;
1762 if (sk->state == TCP_TIME_WAIT)
1765 if (sk_filter(sk, skb, 0))
1766 goto discard_and_relse;
1772 if (!sk->lock.users) {
1773 if (!tcp_prequeue(sk, skb))
1774 ret = tcp_v4_do_rcv(sk, skb);
1776 sk_add_backlog(sk, skb);
1784 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1786 TCP_INC_STATS_BH(TcpInErrs);
1788 tcp_v4_send_reset(skb);
1792 /* Discard frame. */
1801 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1802 TCP_INC_STATS_BH(TcpInErrs);
1803 tcp_tw_put((struct tcp_tw_bucket *) sk);
1806 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1807 skb, th, skb->len)) {
1812 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1814 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1815 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1816 tcp_tw_put((struct tcp_tw_bucket *)sk);
1820 /* Fall through to ACK */
1823 tcp_v4_timewait_ack(sk, skb);
1827 case TCP_TW_SUCCESS:;
1832 /* With per-bucket locks this operation is not atomic, so
1833 * this version is not worse.
1835 static void __tcp_v4_rehash(struct sock *sk)
1837 sk->prot->unhash(sk);
1841 static int tcp_v4_reselect_saddr(struct sock *sk)
1845 __u32 old_saddr = sk->saddr;
1847 __u32 daddr = sk->daddr;
1849 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1850 daddr = sk->protinfo.af_inet.opt->faddr;
1852 /* Query new route. */
1853 err = ip_route_connect(&rt, daddr, 0,
1854 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1859 __sk_dst_set(sk, &rt->u.dst);
1860 sk->route_caps = rt->u.dst.dev->features;
1862 new_saddr = rt->rt_src;
1864 if (new_saddr == old_saddr)
1867 if (sysctl_ip_dynaddr > 1) {
1868 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1869 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1871 NIPQUAD(new_saddr));
1874 sk->saddr = new_saddr;
1875 sk->rcv_saddr = new_saddr;
1877 /* XXX The only ugly spot where we need to
1878 * XXX really change the socket's identity after
1879 * XXX it has entered the hashes. -DaveM
1881 * Besides that, it does not check for connection
1882 * uniqueness. Wait for trouble.
1884 __tcp_v4_rehash(sk);
1888 int tcp_v4_rebuild_header(struct sock *sk)
1890 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1894 /* Route is OK, nothing to do. */
1900 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1901 daddr = sk->protinfo.af_inet.opt->faddr;
1903 err = ip_route_output(&rt, daddr, sk->saddr,
1904 RT_CONN_FLAGS(sk), sk->bound_dev_if);
1906 __sk_dst_set(sk, &rt->u.dst);
1907 sk->route_caps = rt->u.dst.dev->features;
1911 /* Routing failed... */
1914 if (!sysctl_ip_dynaddr ||
1915 sk->state != TCP_SYN_SENT ||
1916 (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1917 (err = tcp_v4_reselect_saddr(sk)) != 0)
1923 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1925 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1927 sin->sin_family = AF_INET;
1928 sin->sin_addr.s_addr = sk->daddr;
1929 sin->sin_port = sk->dport;
1932 /* VJ's idea. Save the last timestamp seen from this destination
1933 * and hold it at least for the normal timewait interval, to use for duplicate
1934 * segment detection in subsequent connections before they enter the synchronized
1938 int tcp_v4_remember_stamp(struct sock *sk)
1940 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1941 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1942 struct inet_peer *peer = NULL;
1945 if (rt == NULL || rt->rt_dst != sk->daddr) {
1946 peer = inet_getpeer(sk->daddr, 1);
1949 if (rt->peer == NULL)
1950 rt_bind_peer(rt, 1);
1955 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1956 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1957 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1958 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1959 peer->tcp_ts = tp->ts_recent;
1969 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1971 struct inet_peer *peer = NULL;
1973 peer = inet_getpeer(tw->daddr, 1);
1976 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1977 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1978 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1979 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1980 peer->tcp_ts = tw->ts_recent;
1989 struct tcp_func ipv4_specific = {
1992 tcp_v4_rebuild_header,
1993 tcp_v4_conn_request,
1994 tcp_v4_syn_recv_sock,
1995 tcp_v4_remember_stamp,
1996 sizeof(struct iphdr),
2001 sizeof(struct sockaddr_in)
2004 /* NOTE: A lot of things are set to zero explicitly by the call to
2005 * sk_alloc(), so they need not be done here.
2007 static int tcp_v4_init_sock(struct sock *sk)
2009 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2011 skb_queue_head_init(&tp->out_of_order_queue);
2012 tcp_init_xmit_timers(sk);
2013 tcp_prequeue_init(tp);
2015 tp->rto = TCP_TIMEOUT_INIT;
2016 tp->mdev = TCP_TIMEOUT_INIT;
2018 /* So many TCP implementations out there (incorrectly) count the
2019 * initial SYN frame in their delayed-ACK and congestion control
2020 * algorithms that we must have the following bandaid to talk
2021 * efficiently to them. -DaveM
2025 /* See draft-stevens-tcpca-spec-01 for discussion of the
2026 * initialization of these values.
2028 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2029 tp->snd_cwnd_clamp = ~0;
2030 tp->mss_cache = 536;
2032 tp->reordering = sysctl_tcp_reordering;
2034 sk->state = TCP_CLOSE;
2036 sk->write_space = tcp_write_space;
2037 sk->use_write_queue = 1;
2039 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2041 sk->sndbuf = sysctl_tcp_wmem[1];
2042 sk->rcvbuf = sysctl_tcp_rmem[1];
2044 atomic_inc(&tcp_sockets_allocated);
2049 static int tcp_v4_destroy_sock(struct sock *sk)
2051 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2053 tcp_clear_xmit_timers(sk);
2055 /* Clean up the write buffer. */
2056 tcp_writequeue_purge(sk);
2058 /* Cleans up our, hopefully empty, out_of_order_queue. */
2059 __skb_queue_purge(&tp->out_of_order_queue);
2061 /* Clean the prequeue; it really must be empty */
2062 __skb_queue_purge(&tp->ucopy.prequeue);
2064 /* Clean up a referenced TCP bind bucket. */
2065 if(sk->prev != NULL)
2068 /* If sendmsg cached page exists, toss it. */
2069 if (tp->sndmsg_page != NULL)
2070 __free_page(tp->sndmsg_page);
2072 atomic_dec(&tcp_sockets_allocated);
2077 /* Proc filesystem TCP sock list dumping. */
2078 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2080 int ttd = req->expires - jiffies;
2082 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2083 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2085 req->af.v4_req.loc_addr,
2087 req->af.v4_req.rmt_addr,
2088 ntohs(req->rmt_port),
2090 0,0, /* could print option size, but that is af dependent. */
2091 1, /* timers active (only the expire timer) */
2095 0, /* non standard timer */
2096 0, /* open_requests have no inode */
2097 atomic_read(&sk->refcnt),
2102 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2104 unsigned int dest, src;
2107 unsigned long timer_expires;
2108 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2111 src = sp->rcv_saddr;
2112 destp = ntohs(sp->dport);
2113 srcp = ntohs(sp->sport);
2114 if (tp->pending == TCP_TIME_RETRANS) {
2116 timer_expires = tp->timeout;
2117 } else if (tp->pending == TCP_TIME_PROBE0) {
2119 timer_expires = tp->timeout;
2120 } else if (timer_pending(&sp->timer)) {
2122 timer_expires = sp->timer.expires;
2125 timer_expires = jiffies;
2128 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2129 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2130 i, src, srcp, dest, destp, sp->state,
2131 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2132 timer_active, timer_expires-jiffies,
2137 atomic_read(&sp->refcnt), sp,
2138 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2139 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2143 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2145 unsigned int dest, src;
2147 int ttd = tw->ttd - jiffies;
2153 src = tw->rcv_saddr;
2154 destp = ntohs(tw->dport);
2155 srcp = ntohs(tw->sport);
2157 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2158 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2159 i, src, srcp, dest, destp, tw->substate, 0, 0,
2161 atomic_read(&tw->refcnt), tw);
2166 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2168 int len = 0, num = 0, i;
2169 off_t begin, pos = 0;
2170 char tmpbuf[TMPSZ+1];
2173 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2174 " sl local_address rem_address st tx_queue "
2175 "rx_queue tr tm->when retrnsmt uid timeout inode");
2179 /* First, walk listening socket table. */
2181 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2183 struct tcp_listen_opt *lopt;
2186 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2187 struct open_request *req;
2189 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2191 if (!TCP_INET_FAMILY(sk->family))
2195 if (pos >= offset) {
2196 get_tcp_sock(sk, tmpbuf, num);
2197 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2198 if (pos >= offset + length) {
2199 tcp_listen_unlock();
2205 uid = sock_i_uid(sk);
2206 read_lock_bh(&tp->syn_wait_lock);
2207 lopt = tp->listen_opt;
2208 if (lopt && lopt->qlen != 0) {
2209 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2210 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2211 if (!TCP_INET_FAMILY(req->class->family))
2217 get_openreq(sk, req, tmpbuf, num, uid);
2218 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2219 if (pos >= offset + length) {
2220 read_unlock_bh(&tp->syn_wait_lock);
2221 tcp_listen_unlock();
2227 read_unlock_bh(&tp->syn_wait_lock);
2229 /* Completed requests are in normal socket hash table */
2232 tcp_listen_unlock();
2236 /* Next, walk established hash chain. */
2237 for (i = 0; i < tcp_ehash_size; i++) {
2238 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2240 struct tcp_tw_bucket *tw;
2242 read_lock(&head->lock);
2243 for(sk = head->chain; sk; sk = sk->next, num++) {
2244 if (!TCP_INET_FAMILY(sk->family))
2249 get_tcp_sock(sk, tmpbuf, num);
2250 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2251 if (pos >= offset + length) {
2252 read_unlock(&head->lock);
2256 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2258 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2259 if (!TCP_INET_FAMILY(tw->family))
2264 get_timewait_sock(tw, tmpbuf, num);
2265 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2266 if (pos >= offset + length) {
2267 read_unlock(&head->lock);
2271 read_unlock(&head->lock);
2278 begin = len - (pos - offset);
2279 *start = buffer + begin;
2288 struct proto tcp_prot = {
2291 connect: tcp_v4_connect,
2292 disconnect: tcp_disconnect,
2295 init: tcp_v4_init_sock,
2296 destroy: tcp_v4_destroy_sock,
2297 shutdown: tcp_shutdown,
2298 setsockopt: tcp_setsockopt,
2299 getsockopt: tcp_getsockopt,
2300 sendmsg: tcp_sendmsg,
2301 recvmsg: tcp_recvmsg,
2302 backlog_rcv: tcp_v4_do_rcv,
2305 get_port: tcp_v4_get_port,
2310 void __init tcp_v4_init(struct net_proto_family *ops)
2314 tcp_inode.i_mode = S_IFSOCK;
2315 tcp_inode.i_sock = 1;
2316 tcp_inode.i_uid = 0;
2317 tcp_inode.i_gid = 0;
2318 init_waitqueue_head(&tcp_inode.i_wait);
2319 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2321 tcp_socket->inode = &tcp_inode;
2322 tcp_socket->state = SS_UNCONNECTED;
2323 tcp_socket->type=SOCK_RAW;
2325 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2326 panic("Failed to create the TCP control socket.\n");
2327 tcp_socket->sk->allocation=GFP_ATOMIC;
2328 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2330 /* Unhash it so that IP input processing does not even
2331 * see it; we do not wish this socket to see incoming
2334 tcp_socket->sk->prot->unhash(tcp_socket->sk);