/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.1.1.1 2005/04/11 02:51:13 jack Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows binding both IPv4 and IPv6 sockets
 *					to a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/init.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/stddef.h>
#include <linux/ipsec.h>
extern int sysctl_ip_dynaddr;
extern int sysctl_ip_default_ttl;
int sysctl_tcp_tw_reuse = 0;
int sysctl_tcp_low_latency = 0;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
/* Socket used for sending RSTs */
static struct inode tcp_inode;
static struct socket *tcp_socket = &tcp_inode.u.socket_i;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
/*
 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	__tcp_ehash:		NULL,
	__tcp_bhash:		NULL,
	__tcp_bhash_size:	0,
	__tcp_ehash_size:	0,
	__tcp_listening_hash:	{ NULL, },
	__tcp_lhash_lock:	RW_LOCK_UNLOCKED,
	__tcp_lhash_users:	ATOMIC_INIT(0),
	__tcp_lhash_wait:
	__WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	__tcp_portalloc_lock:	SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = (1024 - 1);
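/* The range can be tuned at runtime; a busy server is typically run
 * with something like
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * The rover starts just below the low bound so that the very first
 * allocation hands out the first port of the range.
 */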
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = ((laddr ^ lport) ^ (faddr ^ fport));
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	__u32 laddr = sk->rcv_saddr;
	__u16 lport = sk->num;
	__u32 faddr = sk->daddr;
	__u16 fport = sk->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
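/* The hash is a plain XOR of the four tuple members, with the high
 * bits folded down so that they influence the low bits that survive
 * the mask.  A rough worked example, assuming tcp_ehash_size == 512
 * (mask 0x1ff):
 *
 *	laddr = 0xc0a80001 (192.168.0.1),  lport = 80
 *	faddr = 0xc0a80002 (192.168.0.2),  fport = 1025
 *
 *	h = (0xc0a80001 ^ 80) ^ (0xc0a80002 ^ 1025) = 0x452
 *	h ^= h >> 16;		h is still 0x452
 *	h ^= h >> 8;		h becomes 0x456
 *	h &= 0x1ff;		chain index 0x56
 *
 * The table size is a power of two, so masking is cheap and the
 * distribution relies entirely on the XOR mixing.
 */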
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
	if (tb != NULL) {
		tb->port = snum;
		tb->fastreuse = 0;
		tb->owners = NULL;
		if ((tb->next = head->chain) != NULL)
			tb->next->pprev = &tb->next;
		head->chain = tb;
		tb->pprev = &head->chain;
	}
	return tb;
}
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = (struct tcp_bind_bucket *)sk->prev;
	if ((child->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &child->bind_next;
	tb->owners = child;
	child->bind_pprev = &tb->owners;
	child->prev = (struct sock *) tb;
	spin_unlock(&head->lock);
}
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
{
	sk->num = snum;
	if ((sk->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &sk->bind_next;
	tb->owners = sk;
	sk->bind_pprev = &tb->owners;
	sk->prev = (struct sock *) tb;
}
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	struct sock *sk2 = tb->owners;
	int sk_reuse = sk->reuse;

	for ( ; sk2 != NULL; sk2 = sk2->bind_next) {
		if (sk != sk2 &&
		    !ipv6_only_sock(sk2) &&
		    sk->bound_dev_if == sk2->bound_dev_if) {
			if (!sk_reuse ||
			    !sk2->reuse ||
			    sk2->state == TCP_LISTEN) {
				if (!sk2->rcv_saddr ||
				    !sk->rcv_saddr ||
				    (sk2->rcv_saddr == sk->rcv_saddr))
					break;
			}
		}
	}
	return sk2 != NULL;
}
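/* In words: two sockets on the same port conflict only when they are
 * bound to the same device (or both to none), and then only if either
 * side lacks SO_REUSEADDR or the existing socket is already listening,
 * and their local addresses overlap (either one is a wildcard or they
 * are equal).  Roughly, the userspace pattern this enables is:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 *	bind(fd, ...);		reuses a port still held by old sockets
 *
 * so a restarted server can rebind while old connections drain.
 */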
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (snum == 0) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {	rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			for (tb = head->chain; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
		tb = NULL;
		goto tb_not_found;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		for (tb = head->chain; tb != NULL; tb = tb->next)
			if (tb->port == snum)
				break;
	}
	if (tb != NULL && tb->owners != NULL) {
		if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (tb == NULL &&
	    (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (tb->owners == NULL) {
		if (sk->reuse && sk->state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
		tb->fastreuse = 0;
success:
	if (sk->prev == NULL)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(sk->prev == (struct sock *) tb);
	ret = 0;
fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
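/* The rover walk above is the classic ephemeral-port allocator: start
 * one past the last port handed out, wrap inside
 * [sysctl_local_port_range[0], sysctl_local_port_range[1]], and stop
 * at the first port that has no bind bucket at all.  With the default
 * 1024-4999 range that is just under 4000 locally-bound ports; busy
 * machines widen the range rather than change this loop.
 */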
/* Get rid of any references to a local port held by the
 * given sock.
 */
__inline__ void __tcp_put_port(struct sock *sk)
{
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = (struct tcp_bind_bucket *) sk->prev;
	if (sk->bind_next)
		sk->bind_next->bind_pprev = sk->bind_pprev;
	*(sk->bind_pprev) = sk->bind_next;
	sk->prev = NULL;
	sk->num = 0;
	if (tb->owners == NULL) {
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*(tb->pprev) = tb->next;
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look: when several writers sleep and a reader wakes them up, all but one
 * immediately hit the write lock and grab all the cpus.  Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up on each
 * exclusive lock release).  It should really be ifdefed.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&tcp_lhash_users) == 0)
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&tcp_lhash_wait, &wait);
	}
}
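/* The listening hash is protected by a hybrid scheme: readers bump
 * tcp_lhash_users instead of holding tcp_lhash_lock for their whole
 * walk, and a writer must both take the write lock and wait for that
 * reader count to drain to zero.  The loop above is the usual
 * wait-queue pattern: mark ourselves TASK_UNINTERRUPTIBLE, re-check
 * the condition, and drop the lock around schedule() so the readers
 * can actually finish and wake us.
 */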
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct sock **skp;
	rwlock_t *lock;

	BUG_TRAP(sk->pprev == NULL);
	if (listen_possible && sk->state == TCP_LISTEN) {
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->hashent].lock;
		write_lock(lock);
	}
	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
	sock_prot_inc_use(sk->prot);
	write_unlock(lock);
	if (listen_possible && sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}
void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (!sk->pprev)
		goto ende;

	if (sk->state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (sk->pprev) {
		if (sk->next)
			sk->next->pprev = sk->pprev;
		*sk->pprev = sk->next;
		sk->pprev = NULL;
		sock_prot_dec_use(sk->prot);
	}
	write_unlock_bh(lock);

ende:
	if (sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
/* Don't inline this cruft.  There are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
{
	struct sock *result = NULL;
	int score, hiscore = 0;

	for (; sk; sk = sk->next) {
		if (sk->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = sk->rcv_saddr;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
			score = sk->family == PF_INET ? 1 : 0;
#else
			score = 1;
#endif
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->bound_dev_if) {
				if (sk->bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}
/* Optimize the common listener case. */
__inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
{
	struct sock *sk;

	read_lock(&tcp_lhash_lock);
	sk = tcp_listening_hash[tcp_lhashfn(hnum)];
	if (sk) {
		if (sk->num == hnum &&
		    sk->next == NULL &&
		    (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
		    (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum, int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	int hash;

	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	for (sk = head->chain; sk; sk = sk->next) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	for (sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	read_unlock(&head->lock);
	return NULL;

hit:
	sock_hold(sk);
	read_unlock(&head->lock);
	return sk;
}
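/* Layout note: tcp_ehash is allocated as 2 * tcp_ehash_size buckets.
 * Chain i holds established sockets and chain i + tcp_ehash_size holds
 * the TIME_WAIT buckets for the same hash value, so one lock on
 * head->lock covers both probes above.  Roughly:
 *
 *	[0 .. tcp_ehash_size-1]			established sockets
 *	[tcp_ehash_size .. 2*tcp_ehash_size-1]	TIME_WAIT buckets
 *
 * This is what the header comment's "half is devoted to TIME_WAIT
 * sockets" refers to.
 */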
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk;

	sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
	if (sk)
		return sk;
	return tcp_v4_lookup_listener(daddr, hnum, dif);
}
__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	u32 daddr = sk->rcv_saddr;
	u32 saddr = sk->daddr;
	int dif = sk->bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2, **skp;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	for (skp = &(head + tcp_ehash_size)->chain; (sk2 = *skp) != NULL;
	     skp = &sk2->next) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity.  Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only the timestamp cache is held not per host,
			   but per port pair, and the TW bucket is used
			   as state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tw->ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec - tw->ts_recent_stamp > 1))) {
				if ((tp->write_seq = tw->snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent = tw->ts_recent;
				tp->ts_recent_stamp = tw->ts_recent_stamp;
				sock_hold(sk2);
				skp = &head->chain;
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	for (skp = &head->chain; (sk2 = *skp) != NULL; skp = &sk2->next) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now.  Otherwise we will see
	 * in the hash table a socket with a funny identity. */
	sk->num = lport;
	sk->sport = htons(lport);
	BUG_TRAP(sk->pprev == NULL);
	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;

	*skp = sk;
	sk->pprev = skp;
	sk->hashent = hash;
	sock_prot_inc_use(sk->prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly.  Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		tcp_timewait_kill(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
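/* The write_seq choice above is the heart of safe TIME_WAIT recycling:
 * the new connection's initial sequence number starts 65535 + 2 beyond
 * the old snd_nxt, i.e. past anything the old incarnation could still
 * have in flight within one maximum window, and the inherited
 * ts_recent lets PAWS reject stray old segments.  With the (off by
 * default) sysctl
 *
 *	echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *
 * the same test also lets an active connect() take over a TIME_WAIT
 * pair that is more than one second old.
 */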
/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = sk->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;

	if (snum == 0) {
		int rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not so bad an idea to remove
		 * tcp_portalloc_lock before the next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects a single _advisory_ variable, tcp_port_rover,
		 * hence it is mostly useless.
		 * The code will work nicely if we just delete it, but
		 * I am afraid that in the contended case it will work no
		 * better or even worse: another cpu will just hit the same
		 * bucket and spin there.
		 * So some cpu salt could remove both the contention and
		 * the memory pingpong.  Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			for (tb = head->chain; tb; tb = tb->next) {
				if (tb->port == rover) {
					BUG_TRAP(tb->owners != NULL);
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk, rover, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		if (!sk->pprev) {
			sk->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);
			tcp_tw_put(tw);
		}

		local_bh_enable();
		return 0;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = (struct tcp_bind_bucket *)sk->prev;
	spin_lock_bh(&head->lock);
	if (tb->owners == sk && sk->bind_next == NULL) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		int ret;

		spin_unlock(&head->lock);
		/* No definite answer... Walk to the established hash table. */
		ret = __tcp_v4_check_established(sk, snum, NULL);
		local_bh_enable();
		return ret;
	}
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
		if (daddr == 0)
			return -EINVAL;
		nexthop = sk->protinfo.af_inet.opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, sk->saddr,
			       RT_CONN_FLAGS(sk), sk->bound_dev_if);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	__sk_dst_set(sk, &rt->u.dst);
	sk->route_caps = rt->u.dst.dev->features;

	if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
		daddr = rt->rt_dst;
	if (!sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	if (tp->ts_recent_stamp && sk->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp &&
	    rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering
		 * TIME-WAIT state, and initialize ts_recent from it
		 * when trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	sk->dport = usin->sin_port;
	sk->daddr = daddr;

	tp->ext_header_len = 0;
	if (sk->protinfo.af_inet.opt)
		tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
							   sk->sport, usin->sin_port);

	sk->protinfo.af_inet.id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	if (err)
		goto failure;

	return 0;

failure:
	tcp_set_state(sk, TCP_CLOSE);
	__sk_dst_reset(sk);
	sk->route_caps = 0;
	sk->dport = 0;
	return err;
}
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}
static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
{
	unsigned h = raddr ^ rport;
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (TCP_SYNQ_HSIZE - 1);
}
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(req->sk == NULL);
			*prevp = prev;
			return req;
		}
	}

	return NULL;
}
static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct tcp_listen_opt *lopt = tp->listen_opt;
	unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

	tcp_synq_added(sk);
}
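/* Embryonic connections thus live in a small per-listener hash
 * (TCP_SYNQ_HSIZE chains, keyed by remote address and port) rather
 * than in the global tables; only when the final ACK arrives does
 * tcp_v4_syn_recv_sock() promote the open_request into a real sock in
 * tcp_ehash.  syn_wait_lock keeps /proc readers (see tcp_get_info()
 * below) safe against this single-pointer insert.
 */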
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
{
	struct dst_entry *dst;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route.  We just assume that no packet-too-big errors
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	ip_rt_update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
		sk->err_soft = EMSGSIZE;

	if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > dst->pmtu) {
		tcp_sync_mss(sk, dst->pmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped.  This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic".  When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_opt *tp;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
	if (sk == NULL) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}
	if (sk->state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sk->lock.users != 0)
		NET_INC_STATS_BH(LockDroppedIcmps);

	if (sk->state == TCP_CLOSE)
		goto out;

	tp = &sk->tp_pinfo.af_tcp;
	seq = ntohl(th->seq);
	if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(OutOfWindowIcmps);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* This is deprecated, but if someone generated it,
		 * we have no reasons to ignore it.
		 */
		if (sk->lock.users == 0)
			tcp_enter_cwr(tp);
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (sk->lock.users == 0)
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sk->lock.users != 0)
			goto out;

		req = tcp_v4_search_req(tp, &prev,
					th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(req->sk == NULL);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(OutOfWindowIcmps);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen?
			       It can, e.g. if SYNs crossed.
			     */
		if (sk->lock.users == 0) {
			TCP_INC_STATS_BH(TcpAttemptFails);
			sk->err = err;

			sk->error_report(sk);

			tcp_done(sk);
		} else {
			sk->err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in each dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */
	if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
		sk->err = err;
		sk->error_report(sk);
	} else { /* Only an error on timeout */
		sk->err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
					 csum_partial((char *)th, th->doff << 2, skb->csum));
	}
}
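/* In the CHECKSUM_HW case only the pseudo-header sum is folded into
 * th->check (note the ~), and skb->csum records where in the header
 * the NIC should deposit the final one's-complement sum; the card
 * checksums the rest of the segment on the way out.  In the software
 * case csum_partial() covers the header and skb->csum already carries
 * the running sum over the payload, so tcp_v4_check() can finish the
 * whole pseudo-header + segment sum right here.
 */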
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for a reset.
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest = th->source;
	rth.source = th->dest;
	rth.doff = sizeof(struct tcphdr) / 4;
	rth.rst = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
				    + skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr),
				      IPPROTO_TCP,
				      0);
	arg.n_iov = 1;
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TcpOutSegs);
	TCP_INC_STATS_BH(TcpOutRsts);
}
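/* The seq/ack selection follows the RFC 793 reset rules: if the
 * offending segment carried an ACK, the RST claims that sequence
 * number (rth.seq = th->ack_seq) so the peer accepts it; otherwise the
 * RST carries seq 0 with the ACK bit set, acknowledging exactly the
 * sequence space the segment consumed, i.e. seq + data length + one
 * each for SYN and FIN.  For example, a stray SYN with seq 1000 and no
 * data is answered by an RST/ACK with ack_seq 1001.
 */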
/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly.  What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	arg.n_iov = 1;
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
				     (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len,
				      IPPROTO_TCP,
				      0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TcpOutSegs);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
			tw->rcv_wnd >> tw->rcv_wscale, tw->ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt;

	opt = req->af.v4_req.opt;
	if (ip_route_output(&rt, ((opt && opt->srr) ?
				  opt->faddr :
				  req->af.v4_req.rmt_addr),
			    req->af.v4_req.loc_addr,
			    RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
		IP_INC_STATS_BH(IpOutNoRoutes);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IpOutNoRoutes);
		return NULL;
	}
	return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (dst == NULL &&
	    (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len, skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr, req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}
/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}
static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (jiffies - warntime > HZ * 60) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *
tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default.  Experiments with real servers show that
 * it is absolutely not enough even at 100 conn/sec.  256 cures most
 * of the problems.  This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
struct or_calltable or_ipv4 = {
	PF_INET,
	tcp_v4_send_synack,
	tcp_v4_or_send_ack,
	tcp_v4_or_free,
	tcp_v4_send_reset
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* The accept backlog is full.  If we have already queued enough
	 * warm entries in the syn queue, drop the request.  It is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = tcp_openreq_alloc();
	if (req == NULL)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && tp.rcv_tsval == 0) {
		/* Some OSes (unknown ones, but I see them on a web server,
		 * which contains information interesting only for windows'
		 * users) do not send their stamp in the SYN.  It is an easy
		 * case.  We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);

	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (isn == 0) {
		struct inet_peer *peer = NULL;

		/* VJ's idea.  We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * TIME-WAIT state, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(PAWSPassiveRejected);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk)
			  < (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst->rtt)) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven to be
			 * alive.  It means that we continue to communicate
			 * with destinations already remembered at the
			 * moment of the synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
				 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
					NIPQUAD(saddr), ntohs(skb->h.th->source)));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TcpAttemptFails);
	return 0;
}
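/* When the SYN queue overflows and syncookies are enabled, no state at
 * all is kept: the ISN returned in the SYN-ACK encodes the connection
 * tuple and MSS (cookie_v4_init_sequence() above), the open_request is
 * freed immediately after the SYN-ACK is sent, and the handshake is
 * reconstructed from the cookie when the final ACK comes back via
 * cookie_v4_check() in tcp_v4_hnd_req().  The cost is that the TCP
 * options negotiated in the SYN (window scale, SACK, timestamps) are
 * lost for such connections.
 */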
/*
 * The three way handshake has completed - we got a valid final ACK from
 * the peer - now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct tcp_opt *newtp;
	struct sock *newsk;

	if (tcp_acceptq_is_full(sk))
		goto exit_overflow;

	if (dst == NULL &&
	    (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->dst_cache = dst;
	newsk->route_caps = dst->dev->features;

	newtp = &(newsk->tp_pinfo.af_tcp);
	newsk->daddr = req->af.v4_req.rmt_addr;
	newsk->saddr = req->af.v4_req.loc_addr;
	newsk->rcv_saddr = req->af.v4_req.loc_addr;
	newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
	req->af.v4_req.opt = NULL;
	newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
	newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newsk->protinfo.af_inet.opt)
		newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
	newsk->protinfo.af_inet.id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst->pmtu);
	newtp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(ListenOverflows);
exit:
	NET_INC_STATS_BH(ListenDrops);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct open_request *req, **prev;
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sock *nsk;

	/* Find possible connection requests. */
	req = tcp_v4_search_req(tp, &prev,
				th->source,
				iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
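/* On receive the same trick runs in reverse.  If the NIC already
 * summed the segment (CHECKSUM_HW), adding the pseudo-header with
 * tcp_v4_check() must yield zero.  Short segments (<= 76 bytes) are
 * verified completely right here, since deferring buys nothing for
 * them; for anything longer we only precompute the pseudo-header sum
 * into skb->csum and leave the final verification to
 * tcp_checksum_complete() or to the copy-to-user path, which can fold
 * the checksum into the copy.
 */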
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_FILTER
	struct sk_filter *filter = sk->filter;
	if (filter && sk_filter(skb, filter))
		goto discard;
#endif /* CONFIG_FILTER */

	IP_INC_STATS_BH(IpInDelivers);

	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here.  If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here.  This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TcpInSegs);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (!ipsec_sk_policy(sk, skb))
		goto discard_and_relse;
	if (sk->state == TCP_TIME_WAIT)
		goto do_time_wait;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sk->lock.users) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TcpInErrs);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TcpInErrs);
		goto discard_and_relse;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN:
	{
		struct sock *sk2;

		sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
		if (sk2 != NULL) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_timewait_kill((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
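/* Delivery thus takes one of three paths, depending on who holds the
 * socket: if nobody does, tcp_prequeue() may park the segment for the
 * process that is about to read (unless tcp_low_latency is set),
 * otherwise we process it right here in softirq context via
 * tcp_v4_do_rcv(); if a user context holds the socket, the segment
 * goes to the backlog and is replayed by release_sock().  The
 * do_time_wait path additionally lets a fresh SYN that hits a dying
 * TIME_WAIT pair fall through to a live listener (TCP_TW_SYN above).
 */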
/* With per-bucket locks this operation is not atomic, so that
 * this version is not worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->prot->unhash(sk);
	sk->prot->hash(sk);
}
static int tcp_v4_reselect_saddr(struct sock *sk)
{
	int err;
	struct rtable *rt;
	__u32 old_saddr = sk->saddr;
	__u32 new_saddr;
	__u32 daddr = sk->daddr;

	if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
		daddr = sk->protinfo.af_inet.opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
			       sk->bound_dev_if);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	sk->route_caps = rt->u.dst.dev->features;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
		       "from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	sk->saddr = new_saddr;
	sk->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt != NULL)
		return 0;

	/* Reroute. */
	daddr = sk->daddr;
	if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
		daddr = sk->protinfo.af_inet.opt->faddr;

	err = ip_route_output(&rt, daddr, sk->saddr,
			      RT_CONN_FLAGS(sk), sk->bound_dev_if);
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		sk->route_caps = rt->u.dst.dev->features;
		return 0;
	}

	/* Routing failed... */
	sk->route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->state != TCP_SYN_SENT ||
	    (sk->userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->err_soft = err;

	return err;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;

	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = sk->daddr;
	sin->sin_port = sk->dport;
}
/* VJ's idea.  Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections, before they
 * enter synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (rt == NULL || rt->rt_dst != sk->daddr) {
		peer = inet_getpeer(sk->daddr, 1);
		release_it = 1;
	} else {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->ts_recent_stamp;
			peer->tcp_ts = tw->ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct tcp_func ipv4_specific = {
	ip_queue_xmit,
	tcp_v4_send_check,
	tcp_v4_rebuild_header,
	tcp_v4_conn_request,
	tcp_v4_syn_recv_sock,
	tcp_v4_remember_stamp,
	sizeof(struct iphdr),

	ip_setsockopt,
	ip_getsockopt,
	v4_addr2sockaddr,
	sizeof(struct sockaddr_in)
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->state = TCP_CLOSE;

	sk->write_space = tcp_write_space;
	sk->use_write_queue = 1;

	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;

	sk->sndbuf = sysctl_tcp_wmem[1];
	sk->rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	tcp_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean the prequeue; it really must be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (sk->prev != NULL)
		tcp_put_port(sk);

	/* If a sendmsg cached page exists, toss it. */
	if (tp->sndmsg_page != NULL)
		__free_page(tp->sndmsg_page);

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}
/* Proc filesystem TCP sock list dumping. */
static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(sk->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		ttd,
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->refcnt),
		req);
}
static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;

	dest = sp->daddr;
	src = sp->rcv_saddr;
	destp = ntohs(sp->dport);
	srcp = ntohs(sp->sport);
	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active = 1;
		timer_expires = tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = tp->timeout;
	} else if (timer_pending(&sp->timer)) {
		timer_active = 2;
		timer_expires = sp->timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active, timer_expires - jiffies,
		tp->retransmits,
		sock_i_uid(sp), tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd, tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest = tw->daddr;
	src = tw->rcv_saddr;
	destp = ntohs(tw->dport);
	srcp = ntohs(tw->sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->substate, 0, 0,
		3, ttd, 0, 0, 0, 0,
		atomic_read(&tw->refcnt), tw);
}
#define TMPSZ 150

int tcp_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len = 0, num = 0, i;
	off_t begin, pos = 0;
	char tmpbuf[TMPSZ + 1];

	if (offset < TMPSZ)
		len += sprintf(buffer, "%-*s\n", TMPSZ - 1,
			       "  sl  local_address rem_address   st tx_queue "
			       "rx_queue tr tm->when retrnsmt   uid  timeout inode");

	pos = TMPSZ;

	/* First, walk listening socket table. */
	tcp_listen_lock();
	for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
		struct sock *sk;
		struct tcp_listen_opt *lopt;
		int k;

		for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
			struct open_request *req;
			int uid;
			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

			if (!TCP_INET_FAMILY(sk->family))
				goto skip_listen;

			pos += TMPSZ;
			if (pos >= offset) {
				get_tcp_sock(sk, tmpbuf, num);
				len += sprintf(buffer + len, "%-*s\n", TMPSZ - 1, tmpbuf);
				if (pos >= offset + length) {
					tcp_listen_unlock();
					goto out_no_bh;
				}
			}

skip_listen:
			uid = sock_i_uid(sk);
			read_lock_bh(&tp->syn_wait_lock);
			lopt = tp->listen_opt;
			if (lopt && lopt->qlen != 0) {
				for (k = 0; k < TCP_SYNQ_HSIZE; k++) {
					for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
						if (!TCP_INET_FAMILY(req->class->family))
							continue;

						pos += TMPSZ;
						if (pos <= offset)
							continue;
						get_openreq(sk, req, tmpbuf, num, uid);
						len += sprintf(buffer + len, "%-*s\n", TMPSZ - 1, tmpbuf);
						if (pos >= offset + length) {
							read_unlock_bh(&tp->syn_wait_lock);
							tcp_listen_unlock();
							goto out_no_bh;
						}
					}
				}
			}
			read_unlock_bh(&tp->syn_wait_lock);

			/* Completed requests are in the normal socket hash table. */
		}
	}
	tcp_listen_unlock();

	local_bh_disable();

	/* Next, walk established hash chain. */
	for (i = 0; i < tcp_ehash_size; i++) {
		struct tcp_ehash_bucket *head = &tcp_ehash[i];
		struct sock *sk;
		struct tcp_tw_bucket *tw;

		read_lock(&head->lock);
		for (sk = head->chain; sk; sk = sk->next, num++) {
			if (!TCP_INET_FAMILY(sk->family))
				continue;
			pos += TMPSZ;
			if (pos <= offset)
				continue;
			get_tcp_sock(sk, tmpbuf, num);
			len += sprintf(buffer + len, "%-*s\n", TMPSZ - 1, tmpbuf);
			if (pos >= offset + length) {
				read_unlock(&head->lock);
				goto out;
			}
		}
		for (tw = (struct tcp_tw_bucket *)tcp_ehash[i + tcp_ehash_size].chain;
		     tw != NULL;
		     tw = (struct tcp_tw_bucket *)tw->next, num++) {
			if (!TCP_INET_FAMILY(tw->family))
				continue;
			pos += TMPSZ;
			if (pos <= offset)
				continue;
			get_timewait_sock(tw, tmpbuf, num);
			len += sprintf(buffer + len, "%-*s\n", TMPSZ - 1, tmpbuf);
			if (pos >= offset + length) {
				read_unlock(&head->lock);
				goto out;
			}
		}
		read_unlock(&head->lock);
	}

out:
	local_bh_enable();
out_no_bh:
	begin = len - (pos - offset);
	*start = buffer + begin;
	len -= begin;
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}
struct proto tcp_prot = {
	name:		"TCP",
	close:		tcp_close,
	connect:	tcp_v4_connect,
	disconnect:	tcp_disconnect,
	accept:		tcp_accept,
	ioctl:		tcp_ioctl,
	init:		tcp_v4_init_sock,
	destroy:	tcp_v4_destroy_sock,
	shutdown:	tcp_shutdown,
	setsockopt:	tcp_setsockopt,
	getsockopt:	tcp_getsockopt,
	sendmsg:	tcp_sendmsg,
	recvmsg:	tcp_recvmsg,
	backlog_rcv:	tcp_v4_do_rcv,
	hash:		tcp_v4_hash,
	unhash:		tcp_unhash,
	get_port:	tcp_v4_get_port,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err;

	tcp_inode.i_mode = S_IFSOCK;
	tcp_inode.i_sock = 1;
	tcp_inode.i_uid = 0;
	tcp_inode.i_gid = 0;
	init_waitqueue_head(&tcp_inode.i_wait);
	init_waitqueue_head(&tcp_inode.u.socket_i.wait);

	tcp_socket->inode = &tcp_inode;
	tcp_socket->state = SS_UNCONNECTED;
	tcp_socket->type = SOCK_RAW;

	if ((err = ops->create(tcp_socket, IPPROTO_TCP)) < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->allocation = GFP_ATOMIC;
	tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->prot->unhash(tcp_socket->sk);
}