[INET]: Generalise tcp_v4_hash & tcp_unhash
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97         .port_rover     = 1024 - 1,
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
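/* On a live system this pair is normally tuned via the
 * net.ipv4.ip_local_port_range sysctl, e.g.:
 *
 *      echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */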
106
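/* Return nonzero if another socket already bound to this bucket
 * conflicts with 'sk': sk2 is not IPv6-only, the two sockets are not
 * isolated by distinct bound devices, SO_REUSEADDR does not apply
 * (either side lacks it, or sk2 is listening), and the receive
 * addresses overlap (either one is a wildcard, or they are equal).
 */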
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
110         struct sock *sk2;
111         struct hlist_node *node;
112         int reuse = sk->sk_reuse;
113
114         sk_for_each_bound(sk2, node, &tb->owners) {
115                 if (sk != sk2 &&
116                     !tcp_v6_ipv6only(sk2) &&
117                     (!sk->sk_bound_dev_if ||
118                      !sk2->sk_bound_dev_if ||
119                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120                         if (!reuse || !sk2->sk_reuse ||
121                             sk2->sk_state == TCP_LISTEN) {
122                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124                                     sk2_rcv_saddr == sk_rcv_saddr)
125                                         break;
126                         }
127                 }
128         }
129         return node != NULL;
130 }
131
132 /* Obtain a reference to a local port for the given sock,
133  * if snum is zero it means select any available local port.
134  */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136 {
137         struct inet_bind_hashbucket *head;
138         struct hlist_node *node;
139         struct inet_bind_bucket *tb;
140         int ret;
141
142         local_bh_disable();
143         if (!snum) {
144                 int low = sysctl_local_port_range[0];
145                 int high = sysctl_local_port_range[1];
146                 int remaining = (high - low) + 1;
147                 int rover;
148
149                 spin_lock(&tcp_hashinfo.portalloc_lock);
150                 if (tcp_hashinfo.port_rover < low)
151                         rover = low;
152                 else
153                         rover = tcp_hashinfo.port_rover;
154                 do {
155                         rover++;
156                         if (rover > high)
157                                 rover = low;
158                         head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159                         spin_lock(&head->lock);
160                         inet_bind_bucket_for_each(tb, node, &head->chain)
161                                 if (tb->port == rover)
162                                         goto next;
163                         break;
164                 next:
165                         spin_unlock(&head->lock);
166                 } while (--remaining > 0);
167                 tcp_hashinfo.port_rover = rover;
168                 spin_unlock(&tcp_hashinfo.portalloc_lock);
169
170                 /* Exhausted local port range during search?  It is not
171                  * possible for us to be holding one of the bind hash
172                  * locks if this test triggers, because if 'remaining'
173                  * drops to zero, we broke out of the do/while loop at
174                  * the top level, not from the 'break;' statement.
175                  */
176                 ret = 1;
177                 if (unlikely(remaining <= 0))
178                         goto fail;
179
180                 /* OK, here is the one we will use.  HEAD is
181                  * non-NULL and we hold its lock.
182                  */
183                 snum = rover;
184         } else {
185                 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186                 spin_lock(&head->lock);
187                 inet_bind_bucket_for_each(tb, node, &head->chain)
188                         if (tb->port == snum)
189                                 goto tb_found;
190         }
191         tb = NULL;
192         goto tb_not_found;
193 tb_found:
194         if (!hlist_empty(&tb->owners)) {
195                 if (sk->sk_reuse > 1)
196                         goto success;
197                 if (tb->fastreuse > 0 &&
198                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199                         goto success;
200                 } else {
201                         ret = 1;
202                         if (tcp_bind_conflict(sk, tb))
203                                 goto fail_unlock;
204                 }
205         }
206 tb_not_found:
207         ret = 1;
208         if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
209                 goto fail_unlock;
210         if (hlist_empty(&tb->owners)) {
211                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212                         tb->fastreuse = 1;
213                 else
214                         tb->fastreuse = 0;
215         } else if (tb->fastreuse &&
216                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217                 tb->fastreuse = 0;
218 success:
219         if (!inet_sk(sk)->bind_hash)
220                 inet_bind_hash(sk, tb, snum);
221         BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222         ret = 0;
223
224 fail_unlock:
225         spin_unlock(&head->lock);
226 fail:
227         local_bh_enable();
228         return ret;
229 }
230
231 static void tcp_v4_hash(struct sock *sk)
232 {
233         inet_hash(&tcp_hashinfo, sk);
234 }
235
236 void tcp_unhash(struct sock *sk)
237 {
238         inet_unhash(&tcp_hashinfo, sk);
239 }
240
241 /* Don't inline this cruft.  There are some nice properties to
242  * exploit here.  The BSD API does not allow a listening TCP
243  * to specify the remote port nor the remote address for the
244  * connection.  So always assume those are both wildcarded
245  * during the search since they can never be otherwise.
246  */
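/* Candidates are scored: +1 for an AF_INET socket, +2 for an exact
 * local address match, +2 for an exact bound-device match.  A perfect
 * score of 5 ends the search immediately; otherwise the best-scoring
 * listener is returned.
 */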
247 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
248                                              const u32 daddr,
249                                              const unsigned short hnum,
250                                              const int dif)
251 {
252         struct sock *result = NULL, *sk;
253         struct hlist_node *node;
254         int score, hiscore;
255
256         hiscore=-1;
257         sk_for_each(sk, node, head) {
258                 struct inet_sock *inet = inet_sk(sk);
259
260                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
261                         __u32 rcv_saddr = inet->rcv_saddr;
262
263                         score = (sk->sk_family == PF_INET ? 1 : 0);
264                         if (rcv_saddr) {
265                                 if (rcv_saddr != daddr)
266                                         continue;
267                                 score+=2;
268                         }
269                         if (sk->sk_bound_dev_if) {
270                                 if (sk->sk_bound_dev_if != dif)
271                                         continue;
272                                 score+=2;
273                         }
274                         if (score == 5)
275                                 return sk;
276                         if (score > hiscore) {
277                                 hiscore = score;
278                                 result = sk;
279                         }
280                 }
281         }
282         return result;
283 }
284
285 /* Optimize the common listener case. */
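/* Fast path ("sherry_cache"): if the chain holds a single listener
 * whose local address is either unspecified or matches daddr, with no
 * bound device, and which can accept IPv4, take it directly; otherwise
 * fall back to the scored search above.
 */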
286 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
287                                                   const unsigned short hnum,
288                                                   const int dif)
289 {
290         struct sock *sk = NULL;
291         struct hlist_head *head;
292
293         read_lock(&tcp_hashinfo.lhash_lock);
294         head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
295         if (!hlist_empty(head)) {
296                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
297
298                 if (inet->num == hnum && !sk->sk_node.next &&
299                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
300                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
301                     !sk->sk_bound_dev_if)
302                         goto sherry_cache;
303                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
304         }
305         if (sk) {
306 sherry_cache:
307                 sock_hold(sk);
308         }
309         read_unlock(&tcp_hashinfo.lhash_lock);
310         return sk;
311 }
312
313 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
314  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
315  *
316  * Local BH must be disabled here.
317  */
318
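/* The established hash comes in two halves: bucket 'hash' holds
 * full-identity (ESTABLISHED etc.) sockets, while the bucket at
 * 'head + ehash_size' holds the corresponding TIME_WAIT entries.
 * Both chains for a slot are walked under the same head->lock, as the
 * lookup below does.
 */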
319 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
320                                                        const u16 sport,
321                                                        const u32 daddr,
322                                                        const u16 hnum,
323                                                        const int dif)
324 {
325         struct inet_ehash_bucket *head;
326         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
327         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
328         struct sock *sk;
329         struct hlist_node *node;
330         /* Optimize here for direct hit, only listening connections can
331          * have wildcards anyway.
332          */
333         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
334         head = &tcp_hashinfo.ehash[hash];
335         read_lock(&head->lock);
336         sk_for_each(sk, node, &head->chain) {
337                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
338                         goto hit; /* You sunk my battleship! */
339         }
340
341         /* Must check for a TIME_WAIT'er before going to listener hash. */
342         sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
343                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
344                         goto hit;
345         }
346         sk = NULL;
347 out:
348         read_unlock(&head->lock);
349         return sk;
350 hit:
351         sock_hold(sk);
352         goto out;
353 }
354
355 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
356                                            u32 daddr, u16 hnum, int dif)
357 {
358         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
359                                                       daddr, hnum, dif);
360
361         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
362 }
363
364 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
365                                   u16 dport, int dif)
366 {
367         struct sock *sk;
368
369         local_bh_disable();
370         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
371         local_bh_enable();
372
373         return sk;
374 }
375
376 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
377
378 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
379 {
380         return secure_tcp_sequence_number(skb->nh.iph->daddr,
381                                           skb->nh.iph->saddr,
382                                           skb->h.th->dest,
383                                           skb->h.th->source);
384 }
385
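/* For an implicit bind during connect(): verify that the chosen
 * (saddr, lport, daddr, dport) tuple is not already in use.  A
 * matching TIME_WAIT bucket may be recycled when timestamps permit;
 * on success the socket is added to the established hash while the
 * bucket lock is still held, so lookup and insertion cannot race.
 */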
386 /* called with local bh disabled */
387 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
388                                       struct tcp_tw_bucket **twp)
389 {
390         struct inet_sock *inet = inet_sk(sk);
391         u32 daddr = inet->rcv_saddr;
392         u32 saddr = inet->daddr;
393         int dif = sk->sk_bound_dev_if;
394         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
395         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
396         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
397         struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
398         struct sock *sk2;
399         struct hlist_node *node;
400         struct tcp_tw_bucket *tw;
401
402         write_lock(&head->lock);
403
404         /* Check TIME-WAIT sockets first. */
405         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
406                 tw = (struct tcp_tw_bucket *)sk2;
407
408                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
409                         struct tcp_sock *tp = tcp_sk(sk);
410
411                         /* With PAWS, it is safe from the viewpoint
412                            of data integrity. Even without PAWS it
413                            is safe provided sequence spaces do not
414                            overlap i.e. at data rates <= 80Mbit/sec.
415
416                            Actually, the idea is close to VJ's, only the
417                            timestamp cache is held not per host but per
418                            port pair, and the TW bucket is used as the
419                            state holder.
420
421                            If the TW bucket has already been destroyed we
422                            fall back to VJ's scheme and use the initial
423                            timestamp retrieved from the peer table.
424                          */
425                         if (tw->tw_ts_recent_stamp &&
426                             (!twp || (sysctl_tcp_tw_reuse &&
427                                       xtime.tv_sec -
428                                       tw->tw_ts_recent_stamp > 1))) {
429                                 if ((tp->write_seq =
430                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
431                                         tp->write_seq = 1;
432                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
433                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
434                                 sock_hold(sk2);
435                                 goto unique;
436                         } else
437                                 goto not_unique;
438                 }
439         }
440         tw = NULL;
441
442         /* And established part... */
443         sk_for_each(sk2, node, &head->chain) {
444                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
445                         goto not_unique;
446         }
447
448 unique:
449         /* Must record num and sport now. Otherwise we will see
450          * a socket with a funny identity in the hash table. */
451         inet->num = lport;
452         inet->sport = htons(lport);
453         sk->sk_hashent = hash;
454         BUG_TRAP(sk_unhashed(sk));
455         __sk_add_node(sk, &head->chain);
456         sock_prot_inc_use(sk->sk_prot);
457         write_unlock(&head->lock);
458
459         if (twp) {
460                 *twp = tw;
461                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
462         } else if (tw) {
463                 /* Silly. Should hash-dance instead... */
464                 tcp_tw_deschedule(tw);
465                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
466
467                 tcp_tw_put(tw);
468         }
469
470         return 0;
471
472 not_unique:
473         write_unlock(&head->lock);
474         return -EADDRNOTAVAIL;
475 }
476
477 static inline u32 connect_port_offset(const struct sock *sk)
478 {
479         const struct inet_sock *inet = inet_sk(sk);
480
481         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
482                                          inet->dport);
483 }
484
485 /*
486  * Bind a port for a connect operation and hash it.
487  */
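/* When no port has been bound yet, probing starts at a per-destination
 * pseudo-random offset (secure_tcp_port_ephemeral) plus a rolling
 * static hint, so successive connects spread across the local port
 * range.  Bind buckets created here get fastreuse == -1, marking them
 * as connect-only; buckets with fastreuse >= 0 belong to bind()/listen()
 * users and are skipped.
 */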
488 static inline int tcp_v4_hash_connect(struct sock *sk)
489 {
490         const unsigned short snum = inet_sk(sk)->num;
491         struct inet_bind_hashbucket *head;
492         struct inet_bind_bucket *tb;
493         int ret;
494
495         if (!snum) {
496                 int low = sysctl_local_port_range[0];
497                 int high = sysctl_local_port_range[1];
498                 int range = high - low;
499                 int i;
500                 int port;
501                 static u32 hint;
502                 u32 offset = hint + connect_port_offset(sk);
503                 struct hlist_node *node;
504                 struct tcp_tw_bucket *tw = NULL;
505
506                 local_bh_disable();
507                 for (i = 1; i <= range; i++) {
508                         port = low + (i + offset) % range;
509                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
510                         spin_lock(&head->lock);
511
512                         /* Does not bother with rcv_saddr checks,
513                          * because the established check is already
514                          * unique enough.
515                          */
516                         inet_bind_bucket_for_each(tb, node, &head->chain) {
517                                 if (tb->port == port) {
518                                         BUG_TRAP(!hlist_empty(&tb->owners));
519                                         if (tb->fastreuse >= 0)
520                                                 goto next_port;
521                                         if (!__tcp_v4_check_established(sk,
522                                                                         port,
523                                                                         &tw))
524                                                 goto ok;
525                                         goto next_port;
526                                 }
527                         }
528
529                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
530                         if (!tb) {
531                                 spin_unlock(&head->lock);
532                                 break;
533                         }
534                         tb->fastreuse = -1;
535                         goto ok;
536
537                 next_port:
538                         spin_unlock(&head->lock);
539                 }
540                 local_bh_enable();
541
542                 return -EADDRNOTAVAIL;
543
544 ok:
545                 hint += i;
546
547                 /* Head lock still held and bh's disabled */
548                 inet_bind_hash(sk, tb, port);
549                 if (sk_unhashed(sk)) {
550                         inet_sk(sk)->sport = htons(port);
551                         __inet_hash(&tcp_hashinfo, sk, 0);
552                 }
553                 spin_unlock(&head->lock);
554
555                 if (tw) {
556                         tcp_tw_deschedule(tw);
557                         tcp_tw_put(tw);
558                 }
559
560                 ret = 0;
561                 goto out;
562         }
563
564         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
565         tb  = inet_sk(sk)->bind_hash;
566         spin_lock_bh(&head->lock);
567         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
568                 __inet_hash(&tcp_hashinfo, sk, 0);
569                 spin_unlock_bh(&head->lock);
570                 return 0;
571         } else {
572                 spin_unlock(&head->lock);
573                 /* No definite answer... Walk the established hash table */
574                 ret = __tcp_v4_check_established(sk, snum, NULL);
575 out:
576                 local_bh_enable();
577                 return ret;
578         }
579 }
580
581 /* This will initiate an outgoing connection. */
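/* Roughly: route to the destination (honouring source routing), pick a
 * source address, optionally seed PAWS state from the inet peer when
 * tcp_tw_recycle is enabled, move to SYN-SENT, grab and hash a local
 * port via tcp_v4_hash_connect(), commit the route, choose the initial
 * sequence number and finally emit the SYN via tcp_connect().
 */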
582 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
583 {
584         struct inet_sock *inet = inet_sk(sk);
585         struct tcp_sock *tp = tcp_sk(sk);
586         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
587         struct rtable *rt;
588         u32 daddr, nexthop;
589         int tmp;
590         int err;
591
592         if (addr_len < sizeof(struct sockaddr_in))
593                 return -EINVAL;
594
595         if (usin->sin_family != AF_INET)
596                 return -EAFNOSUPPORT;
597
598         nexthop = daddr = usin->sin_addr.s_addr;
599         if (inet->opt && inet->opt->srr) {
600                 if (!daddr)
601                         return -EINVAL;
602                 nexthop = inet->opt->faddr;
603         }
604
605         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
606                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
607                                IPPROTO_TCP,
608                                inet->sport, usin->sin_port, sk);
609         if (tmp < 0)
610                 return tmp;
611
612         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
613                 ip_rt_put(rt);
614                 return -ENETUNREACH;
615         }
616
617         if (!inet->opt || !inet->opt->srr)
618                 daddr = rt->rt_dst;
619
620         if (!inet->saddr)
621                 inet->saddr = rt->rt_src;
622         inet->rcv_saddr = inet->saddr;
623
624         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
625                 /* Reset inherited state */
626                 tp->rx_opt.ts_recent       = 0;
627                 tp->rx_opt.ts_recent_stamp = 0;
628                 tp->write_seq              = 0;
629         }
630
631         if (sysctl_tcp_tw_recycle &&
632             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
633                 struct inet_peer *peer = rt_get_peer(rt);
634
635                 /* VJ's idea. We save the last timestamp seen from
636                  * the destination in the peer table when entering TIME-WAIT state,
637                  * and initialize rx_opt.ts_recent from it when trying a new connection.
638                  */
639
640                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
641                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
642                         tp->rx_opt.ts_recent = peer->tcp_ts;
643                 }
644         }
645
646         inet->dport = usin->sin_port;
647         inet->daddr = daddr;
648
649         tp->ext_header_len = 0;
650         if (inet->opt)
651                 tp->ext_header_len = inet->opt->optlen;
652
653         tp->rx_opt.mss_clamp = 536;
654
655         /* Socket identity is still unknown (sport may be zero).
656          * However we set state to SYN-SENT and, without releasing the socket
657          * lock, select a source port, enter ourselves into the hash tables and
658          * complete initialization after this.
659          */
660         tcp_set_state(sk, TCP_SYN_SENT);
661         err = tcp_v4_hash_connect(sk);
662         if (err)
663                 goto failure;
664
665         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
666         if (err)
667                 goto failure;
668
669         /* OK, now commit destination to socket.  */
670         sk_setup_caps(sk, &rt->u.dst);
671
672         if (!tp->write_seq)
673                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
674                                                            inet->daddr,
675                                                            inet->sport,
676                                                            usin->sin_port);
677
678         inet->id = tp->write_seq ^ jiffies;
679
680         err = tcp_connect(sk);
681         rt = NULL;
682         if (err)
683                 goto failure;
684
685         return 0;
686
687 failure:
688         /* This unhashes the socket and releases the local port, if necessary. */
689         tcp_set_state(sk, TCP_CLOSE);
690         ip_rt_put(rt);
691         sk->sk_route_caps = 0;
692         inet->dport = 0;
693         return err;
694 }
695
696 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
697 {
698         return ((struct rtable *)skb->dst)->rt_iif;
699 }
700
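/* Open requests for a listener are kept in a private SYN table,
 * hashed over the remote address and port and salted with
 * lopt->hash_rnd, in TCP_SYNQ_HSIZE buckets; see tcp_v4_search_req()
 * and tcp_v4_synq_add() below.
 */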
701 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
702 {
703         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
704 }
705
706 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
707                                               struct request_sock ***prevp,
708                                               __u16 rport,
709                                               __u32 raddr, __u32 laddr)
710 {
711         struct listen_sock *lopt = tp->accept_queue.listen_opt;
712         struct request_sock *req, **prev;
713
714         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
715              (req = *prev) != NULL;
716              prev = &req->dl_next) {
717                 const struct inet_request_sock *ireq = inet_rsk(req);
718
719                 if (ireq->rmt_port == rport &&
720                     ireq->rmt_addr == raddr &&
721                     ireq->loc_addr == laddr &&
722                     TCP_INET_FAMILY(req->rsk_ops->family)) {
723                         BUG_TRAP(!req->sk);
724                         *prevp = prev;
725                         break;
726                 }
727         }
728
729         return req;
730 }
731
732 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
733 {
734         struct tcp_sock *tp = tcp_sk(sk);
735         struct listen_sock *lopt = tp->accept_queue.listen_opt;
736         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
737
738         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
739         tcp_synq_added(sk);
740 }
741
742
743 /*
744  * This routine does path mtu discovery as defined in RFC1191.
745  */
746 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
747                                      u32 mtu)
748 {
749         struct dst_entry *dst;
750         struct inet_sock *inet = inet_sk(sk);
751         struct tcp_sock *tp = tcp_sk(sk);
752
753         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
754          * sent out by Linux are always < 576 bytes, so they should go through
755          * unfragmented).
756          */
757         if (sk->sk_state == TCP_LISTEN)
758                 return;
759
760         /* We don't check in the dst entry if pmtu discovery is forbidden
761          * on this route. We just assume that no packet-too-big packets
762          * are sent back when pmtu discovery is not active.
763          * There is a small race when the user changes this flag in the
764          * route, but I think that's acceptable.
765          */
766         if ((dst = __sk_dst_check(sk, 0)) == NULL)
767                 return;
768
769         dst->ops->update_pmtu(dst, mtu);
770
771         /* Something is about to go wrong... Remember the soft error
772          * for the case that this connection will not be able to recover.
773          */
774         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
775                 sk->sk_err_soft = EMSGSIZE;
776
777         mtu = dst_mtu(dst);
778
779         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
780             tp->pmtu_cookie > mtu) {
781                 tcp_sync_mss(sk, mtu);
782
783                 /* Resend the TCP packet because it's
784                  * clear that the old packet has been
785                  * dropped. This is the new "fast" path mtu
786                  * discovery.
787                  */
788                 tcp_simple_retransmit(sk);
789         } /* else let the usual retransmit timer handle it */
790 }
791
792 /*
793  * This routine is called by the ICMP module when it gets some
794  * sort of error condition.  If err < 0 then the socket should
795  * be closed and the error returned to the user.  If err > 0
796  * it's just the icmp type << 8 | icmp code.  After adjustment
797  * header points to the first 8 bytes of the tcp header.  We need
798  * to find the appropriate port.
799  *
800  * The locking strategy used here is very "optimistic". When
801  * someone else accesses the socket the ICMP is just dropped
802  * and for some paths there is no check at all.
803  * A more general error queue to queue errors for later handling
804  * is probably better.
805  *
806  */
807
808 void tcp_v4_err(struct sk_buff *skb, u32 info)
809 {
810         struct iphdr *iph = (struct iphdr *)skb->data;
811         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
812         struct tcp_sock *tp;
813         struct inet_sock *inet;
814         int type = skb->h.icmph->type;
815         int code = skb->h.icmph->code;
816         struct sock *sk;
817         __u32 seq;
818         int err;
819
820         if (skb->len < (iph->ihl << 2) + 8) {
821                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
822                 return;
823         }
824
825         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
826                            th->source, tcp_v4_iif(skb));
827         if (!sk) {
828                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
829                 return;
830         }
831         if (sk->sk_state == TCP_TIME_WAIT) {
832                 tcp_tw_put((struct tcp_tw_bucket *)sk);
833                 return;
834         }
835
836         bh_lock_sock(sk);
837         /* If too many ICMPs get dropped on busy
838          * servers this needs to be solved differently.
839          */
840         if (sock_owned_by_user(sk))
841                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
842
843         if (sk->sk_state == TCP_CLOSE)
844                 goto out;
845
846         tp = tcp_sk(sk);
847         seq = ntohl(th->seq);
848         if (sk->sk_state != TCP_LISTEN &&
849             !between(seq, tp->snd_una, tp->snd_nxt)) {
850                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
851                 goto out;
852         }
853
854         switch (type) {
855         case ICMP_SOURCE_QUENCH:
856                 /* Just silently ignore these. */
857                 goto out;
858         case ICMP_PARAMETERPROB:
859                 err = EPROTO;
860                 break;
861         case ICMP_DEST_UNREACH:
862                 if (code > NR_ICMP_UNREACH)
863                         goto out;
864
865                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
866                         if (!sock_owned_by_user(sk))
867                                 do_pmtu_discovery(sk, iph, info);
868                         goto out;
869                 }
870
871                 err = icmp_err_convert[code].errno;
872                 break;
873         case ICMP_TIME_EXCEEDED:
874                 err = EHOSTUNREACH;
875                 break;
876         default:
877                 goto out;
878         }
879
880         switch (sk->sk_state) {
881                 struct request_sock *req, **prev;
882         case TCP_LISTEN:
883                 if (sock_owned_by_user(sk))
884                         goto out;
885
886                 req = tcp_v4_search_req(tp, &prev, th->dest,
887                                         iph->daddr, iph->saddr);
888                 if (!req)
889                         goto out;
890
891                 /* ICMPs are not backlogged, hence we cannot get
892                    an established socket here.
893                  */
894                 BUG_TRAP(!req->sk);
895
896                 if (seq != tcp_rsk(req)->snt_isn) {
897                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
898                         goto out;
899                 }
900
901                 /*
902                  * Still in SYN_RECV, just remove it silently.
903                  * There is no good way to pass the error to the newly
904                  * created socket, and POSIX does not want network
905                  * errors returned from accept().
906                  */
907                 tcp_synq_drop(sk, req, prev);
908                 goto out;
909
910         case TCP_SYN_SENT:
911         case TCP_SYN_RECV:  /* Cannot normally happen.
912                                It can, f.e., if SYNs crossed.
913                              */
914                 if (!sock_owned_by_user(sk)) {
915                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
916                         sk->sk_err = err;
917
918                         sk->sk_error_report(sk);
919
920                         tcp_done(sk);
921                 } else {
922                         sk->sk_err_soft = err;
923                 }
924                 goto out;
925         }
926
927         /* If we've already connected we will keep trying
928          * until we time out, or the user gives up.
929          *
930          * rfc1122 4.2.3.9 allows us to consider as hard errors
931          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
932          * but it is obsoleted by pmtu discovery).
933          *
934          * Note that in the modern internet, where routing is unreliable
935          * and broken firewalls sit in every dark corner sending random
936          * errors ordered by their masters, even these two messages finally lose
937          * their original sense (even Linux sends invalid PORT_UNREACHs).
938          *
939          * Now we are in compliance with RFCs.
940          *                                                      --ANK (980905)
941          */
942
943         inet = inet_sk(sk);
944         if (!sock_owned_by_user(sk) && inet->recverr) {
945                 sk->sk_err = err;
946                 sk->sk_error_report(sk);
947         } else  { /* Only an error on timeout */
948                 sk->sk_err_soft = err;
949         }
950
951 out:
952         bh_unlock_sock(sk);
953         sock_put(sk);
954 }
955
956 /* This routine computes an IPv4 TCP checksum. */
957 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
958                        struct sk_buff *skb)
959 {
960         struct inet_sock *inet = inet_sk(sk);
961
962         if (skb->ip_summed == CHECKSUM_HW) {
963                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
964                 skb->csum = offsetof(struct tcphdr, check);
965         } else {
966                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
967                                          csum_partial((char *)th,
968                                                       th->doff << 2,
969                                                       skb->csum));
970         }
971 }
972
973 /*
974  *      This routine will send an RST to the other tcp.
975  *
976  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
977  *                    for the reset?
978  *      Answer: if a packet caused the RST, it is not for a socket
979  *              existing in our system; if it is matched to a socket,
980  *              it is just a duplicate segment or a bug in the other side's TCP.
981  *              So we build the reply based only on the parameters that
982  *              arrived with the segment.
983  *      Exception: precedence violation. We do not implement it in any case.
984  */
985
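/* The RST's sequence numbers come purely from the offending segment:
 * if it carried an ACK, the RST's seq is that ack_seq; otherwise the
 * RST acknowledges everything the segment covered (seq + data length
 * + SYN/FIN flags).
 */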
986 static void tcp_v4_send_reset(struct sk_buff *skb)
987 {
988         struct tcphdr *th = skb->h.th;
989         struct tcphdr rth;
990         struct ip_reply_arg arg;
991
992         /* Never send a reset in response to a reset. */
993         if (th->rst)
994                 return;
995
996         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
997                 return;
998
999         /* Swap the send and the receive. */
1000         memset(&rth, 0, sizeof(struct tcphdr));
1001         rth.dest   = th->source;
1002         rth.source = th->dest;
1003         rth.doff   = sizeof(struct tcphdr) / 4;
1004         rth.rst    = 1;
1005
1006         if (th->ack) {
1007                 rth.seq = th->ack_seq;
1008         } else {
1009                 rth.ack = 1;
1010                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1011                                     skb->len - (th->doff << 2));
1012         }
1013
1014         memset(&arg, 0, sizeof arg);
1015         arg.iov[0].iov_base = (unsigned char *)&rth;
1016         arg.iov[0].iov_len  = sizeof rth;
1017         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1018                                       skb->nh.iph->saddr, /*XXX*/
1019                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1020         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1021
1022         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1023
1024         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1025         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1026 }
1027
1028 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1029    outside socket context, is ugly, certainly. What can I do?
1030  */
1031
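/* Build a bare ACK, optionally carrying a TCP timestamp option laid
 * out as NOP, NOP, TIMESTAMP (see rep.tsopt below), and send it with
 * ip_send_reply() as a direct reply to the incoming skb.
 */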
1032 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1033                             u32 win, u32 ts)
1034 {
1035         struct tcphdr *th = skb->h.th;
1036         struct {
1037                 struct tcphdr th;
1038                 u32 tsopt[3];
1039         } rep;
1040         struct ip_reply_arg arg;
1041
1042         memset(&rep.th, 0, sizeof(struct tcphdr));
1043         memset(&arg, 0, sizeof arg);
1044
1045         arg.iov[0].iov_base = (unsigned char *)&rep;
1046         arg.iov[0].iov_len  = sizeof(rep.th);
1047         if (ts) {
1048                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1049                                      (TCPOPT_TIMESTAMP << 8) |
1050                                      TCPOLEN_TIMESTAMP);
1051                 rep.tsopt[1] = htonl(tcp_time_stamp);
1052                 rep.tsopt[2] = htonl(ts);
1053                 arg.iov[0].iov_len = sizeof(rep);
1054         }
1055
1056         /* Swap the send and the receive. */
1057         rep.th.dest    = th->source;
1058         rep.th.source  = th->dest;
1059         rep.th.doff    = arg.iov[0].iov_len / 4;
1060         rep.th.seq     = htonl(seq);
1061         rep.th.ack_seq = htonl(ack);
1062         rep.th.ack     = 1;
1063         rep.th.window  = htons(win);
1064
1065         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1066                                       skb->nh.iph->saddr, /*XXX*/
1067                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1068         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1069
1070         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1071
1072         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1073 }
1074
1075 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1076 {
1077         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1078
1079         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1080                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1081
1082         tcp_tw_put(tw);
1083 }
1084
1085 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1086 {
1087         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1088                         req->ts_recent);
1089 }
1090
1091 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1092                                           struct request_sock *req)
1093 {
1094         struct rtable *rt;
1095         const struct inet_request_sock *ireq = inet_rsk(req);
1096         struct ip_options *opt = inet_rsk(req)->opt;
1097         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1098                             .nl_u = { .ip4_u =
1099                                       { .daddr = ((opt && opt->srr) ?
1100                                                   opt->faddr :
1101                                                   ireq->rmt_addr),
1102                                         .saddr = ireq->loc_addr,
1103                                         .tos = RT_CONN_FLAGS(sk) } },
1104                             .proto = IPPROTO_TCP,
1105                             .uli_u = { .ports =
1106                                        { .sport = inet_sk(sk)->sport,
1107                                          .dport = ireq->rmt_port } } };
1108
1109         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1110                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1111                 return NULL;
1112         }
1113         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1114                 ip_rt_put(rt);
1115                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1116                 return NULL;
1117         }
1118         return &rt->u.dst;
1119 }
1120
1121 /*
1122  *      Send a SYN-ACK after having received a SYN.
1123  *      This still operates on a request_sock only, not on a big
1124  *      socket.
1125  */
1126 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1127                               struct dst_entry *dst)
1128 {
1129         const struct inet_request_sock *ireq = inet_rsk(req);
1130         int err = -1;
1131         struct sk_buff * skb;
1132
1133         /* First, grab a route. */
1134         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1135                 goto out;
1136
1137         skb = tcp_make_synack(sk, dst, req);
1138
1139         if (skb) {
1140                 struct tcphdr *th = skb->h.th;
1141
1142                 th->check = tcp_v4_check(th, skb->len,
1143                                          ireq->loc_addr,
1144                                          ireq->rmt_addr,
1145                                          csum_partial((char *)th, skb->len,
1146                                                       skb->csum));
1147
1148                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1149                                             ireq->rmt_addr,
1150                                             ireq->opt);
1151                 if (err == NET_XMIT_CN)
1152                         err = 0;
1153         }
1154
1155 out:
1156         dst_release(dst);
1157         return err;
1158 }
1159
1160 /*
1161  *      IPv4 request_sock destructor.
1162  */
1163 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1164 {
1165         if (inet_rsk(req)->opt)
1166                 kfree(inet_rsk(req)->opt);
1167 }
1168
1169 static inline void syn_flood_warning(struct sk_buff *skb)
1170 {
1171         static unsigned long warntime;
1172
1173         if (time_after(jiffies, (warntime + HZ * 60))) {
1174                 warntime = jiffies;
1175                 printk(KERN_INFO
1176                        "possible SYN flooding on port %d. Sending cookies.\n",
1177                        ntohs(skb->h.th->dest));
1178         }
1179 }
1180
1181 /*
1182  * Save and compile IPv4 options into the request_sock if needed.
1183  */
1184 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1185                                                      struct sk_buff *skb)
1186 {
1187         struct ip_options *opt = &(IPCB(skb)->opt);
1188         struct ip_options *dopt = NULL;
1189
1190         if (opt && opt->optlen) {
1191                 int opt_size = optlength(opt);
1192                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1193                 if (dopt) {
1194                         if (ip_options_echo(dopt, skb)) {
1195                                 kfree(dopt);
1196                                 dopt = NULL;
1197                         }
1198                 }
1199         }
1200         return dopt;
1201 }
1202
1203 struct request_sock_ops tcp_request_sock_ops = {
1204         .family         =       PF_INET,
1205         .obj_size       =       sizeof(struct tcp_request_sock),
1206         .rtx_syn_ack    =       tcp_v4_send_synack,
1207         .send_ack       =       tcp_v4_reqsk_send_ack,
1208         .destructor     =       tcp_v4_reqsk_destructor,
1209         .send_reset     =       tcp_v4_send_reset,
1210 };
1211
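/* Handle an incoming SYN on a listening socket: ignore SYNs to
 * broadcast/multicast destinations, fall back to syncookies (or drop)
 * when the SYN queue is full, drop when the accept backlog is full and
 * enough young entries are already queued, then allocate a
 * request_sock, choose the initial sequence number (cookie, TIME-WAIT
 * recycling checks, or the secure ISN), send the SYN-ACK and queue the
 * request.
 */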
1212 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1213 {
1214         struct inet_request_sock *ireq;
1215         struct tcp_options_received tmp_opt;
1216         struct request_sock *req;
1217         __u32 saddr = skb->nh.iph->saddr;
1218         __u32 daddr = skb->nh.iph->daddr;
1219         __u32 isn = TCP_SKB_CB(skb)->when;
1220         struct dst_entry *dst = NULL;
1221 #ifdef CONFIG_SYN_COOKIES
1222         int want_cookie = 0;
1223 #else
1224 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1225 #endif
1226
1227         /* Never answer SYNs sent to broadcast or multicast */
1228         if (((struct rtable *)skb->dst)->rt_flags &
1229             (RTCF_BROADCAST | RTCF_MULTICAST))
1230                 goto drop;
1231
1232         /* TW buckets are converted to open requests without
1233          * limitations, they conserve resources and the peer is
1234          * evidently a real one.
1235          */
1236         if (tcp_synq_is_full(sk) && !isn) {
1237 #ifdef CONFIG_SYN_COOKIES
1238                 if (sysctl_tcp_syncookies) {
1239                         want_cookie = 1;
1240                 } else
1241 #endif
1242                 goto drop;
1243         }
1244
1245         /* Accept backlog is full. If we have already queued enough
1246          * warm entries in the syn queue, drop the request. It is better than
1247          * clogging syn queue with openreqs with exponentially increasing
1248          * timeout.
1249          */
1250         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1251                 goto drop;
1252
1253         req = reqsk_alloc(&tcp_request_sock_ops);
1254         if (!req)
1255                 goto drop;
1256
1257         tcp_clear_options(&tmp_opt);
1258         tmp_opt.mss_clamp = 536;
1259         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1260
1261         tcp_parse_options(skb, &tmp_opt, 0);
1262
1263         if (want_cookie) {
1264                 tcp_clear_options(&tmp_opt);
1265                 tmp_opt.saw_tstamp = 0;
1266         }
1267
1268         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1269                 /* Some OSes (unknown ones, but I see them on a web server, which
1270                  * contains information interesting only for windows
1271                  * users) do not send their stamp in the SYN. It is an easy case:
1272                  * we simply do not advertise TS support.
1273                  */
1274                 tmp_opt.saw_tstamp = 0;
1275                 tmp_opt.tstamp_ok  = 0;
1276         }
1277         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1278
1279         tcp_openreq_init(req, &tmp_opt, skb);
1280
1281         ireq = inet_rsk(req);
1282         ireq->loc_addr = daddr;
1283         ireq->rmt_addr = saddr;
1284         ireq->opt = tcp_v4_save_options(sk, skb);
1285         if (!want_cookie)
1286                 TCP_ECN_create_request(req, skb->h.th);
1287
1288         if (want_cookie) {
1289 #ifdef CONFIG_SYN_COOKIES
1290                 syn_flood_warning(skb);
1291 #endif
1292                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1293         } else if (!isn) {
1294                 struct inet_peer *peer = NULL;
1295
1296                 /* VJ's idea. We save the last timestamp seen
1297                  * from the destination in the peer table, when entering
1298                  * TIME-WAIT state, and check against it before
1299                  * accepting a new connection request.
1300                  *
1301                  * If "isn" is not zero, this request hit an alive
1302                  * timewait bucket, so all the necessary checks
1303                  * are made in the function processing the timewait state.
1304                  */
1305                 if (tmp_opt.saw_tstamp &&
1306                     sysctl_tcp_tw_recycle &&
1307                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1308                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1309                     peer->v4daddr == saddr) {
1310                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1311                             (s32)(peer->tcp_ts - req->ts_recent) >
1312                                                         TCP_PAWS_WINDOW) {
1313                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1314                                 dst_release(dst);
1315                                 goto drop_and_free;
1316                         }
1317                 }
1318                 /* Kill the following clause, if you dislike this way. */
1319                 else if (!sysctl_tcp_syncookies &&
1320                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1321                           (sysctl_max_syn_backlog >> 2)) &&
1322                          (!peer || !peer->tcp_ts_stamp) &&
1323                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1324                         /* Without syncookies the last quarter of the
1325                          * backlog is filled only with destinations
1326                          * proven to be alive.
1327                          * It means that we continue to communicate
1328                          * with destinations already remembered at
1329                          * the moment of the synflood.
1330                          */
1331                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1332                                               "request from %u.%u."
1333                                               "%u.%u/%u\n",
1334                                               NIPQUAD(saddr),
1335                                               ntohs(skb->h.th->source)));
1336                         dst_release(dst);
1337                         goto drop_and_free;
1338                 }
1339
1340                 isn = tcp_v4_init_sequence(sk, skb);
1341         }
1342         tcp_rsk(req)->snt_isn = isn;
1343
1344         if (tcp_v4_send_synack(sk, req, dst))
1345                 goto drop_and_free;
1346
1347         if (want_cookie) {
1348                 reqsk_free(req);
1349         } else {
1350                 tcp_v4_synq_add(sk, req);
1351         }
1352         return 0;
1353
1354 drop_and_free:
1355         reqsk_free(req);
1356 drop:
1357         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1358         return 0;
1359 }
1360
1361
1362 /*
1363  * The three way handshake has completed - we got a valid ACK -
1364  * now create the new socket.
1365  */
1366 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1367                                   struct request_sock *req,
1368                                   struct dst_entry *dst)
1369 {
1370         struct inet_request_sock *ireq;
1371         struct inet_sock *newinet;
1372         struct tcp_sock *newtp;
1373         struct sock *newsk;
1374
1375         if (sk_acceptq_is_full(sk))
1376                 goto exit_overflow;
1377
1378         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1379                 goto exit;
1380
1381         newsk = tcp_create_openreq_child(sk, req, skb);
1382         if (!newsk)
1383                 goto exit;
1384
1385         sk_setup_caps(newsk, dst);
1386
1387         newtp                 = tcp_sk(newsk);
1388         newinet               = inet_sk(newsk);
1389         ireq                  = inet_rsk(req);
1390         newinet->daddr        = ireq->rmt_addr;
1391         newinet->rcv_saddr    = ireq->loc_addr;
1392         newinet->saddr        = ireq->loc_addr;
1393         newinet->opt          = ireq->opt;
1394         ireq->opt             = NULL;
1395         newinet->mc_index     = tcp_v4_iif(skb);
1396         newinet->mc_ttl       = skb->nh.iph->ttl;
1397         newtp->ext_header_len = 0;
1398         if (newinet->opt)
1399                 newtp->ext_header_len = newinet->opt->optlen;
1400         newinet->id = newtp->write_seq ^ jiffies;
1401
1402         tcp_sync_mss(newsk, dst_mtu(dst));
1403         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1404         tcp_initialize_rcv_mss(newsk);
1405
1406         __inet_hash(&tcp_hashinfo, newsk, 0);
1407         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1408
1409         return newsk;
1410
1411 exit_overflow:
1412         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1413 exit:
1414         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1415         dst_release(dst);
1416         return NULL;
1417 }
1418
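     /* Handle a segment that arrived for a listening socket: first look for a
      * matching half-open connection request, then for an already established
      * child socket (possibly in TIME_WAIT), and finally, if syncookies are
      * compiled in, try to validate a bare ACK as a cookie.
      */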
1419 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1420 {
1421         struct tcphdr *th = skb->h.th;
1422         struct iphdr *iph = skb->nh.iph;
1423         struct tcp_sock *tp = tcp_sk(sk);
1424         struct sock *nsk;
1425         struct request_sock **prev;
1426         /* Find possible connection requests. */
1427         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1428                                                      iph->saddr, iph->daddr);
1429         if (req)
1430                 return tcp_check_req(sk, skb, req, prev);
1431
1432         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1433                                           th->source,
1434                                           skb->nh.iph->daddr,
1435                                           ntohs(th->dest),
1436                                           tcp_v4_iif(skb));
1437
1438         if (nsk) {
1439                 if (nsk->sk_state != TCP_TIME_WAIT) {
1440                         bh_lock_sock(nsk);
1441                         return nsk;
1442                 }
1443                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1444                 return NULL;
1445         }
1446
1447 #ifdef CONFIG_SYN_COOKIES
1448         if (!th->rst && !th->syn && th->ack)
1449                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1450 #endif
1451         return sk;
1452 }
1453
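     /* Initialise checksum state for an incoming segment.  A hardware checksum
      * is verified immediately; short packets (<= 76 bytes) are fully verified
      * in software here, while longer ones only get the pseudo-header checksum
      * seeded so verification can be completed later (see
      * tcp_checksum_complete()).
      */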
1454 static int tcp_v4_checksum_init(struct sk_buff *skb)
1455 {
1456         if (skb->ip_summed == CHECKSUM_HW) {
1457                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1458                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1459                                   skb->nh.iph->daddr, skb->csum))
1460                         return 0;
1461
1462                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1463                 skb->ip_summed = CHECKSUM_NONE;
1464         }
1465         if (skb->len <= 76) {
1466                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1467                                  skb->nh.iph->daddr,
1468                                  skb_checksum(skb, 0, skb->len, 0)))
1469                         return -1;
1470                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1471         } else {
1472                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1473                                           skb->nh.iph->saddr,
1474                                           skb->nh.iph->daddr, 0);
1475         }
1476         return 0;
1477 }
1478
1479
1480 /* The socket must have its spinlock held when we get
1481  * here.
1482  *
1483  * We have a potential double-lock case here, so even when
1484  * doing backlog processing we use the BH locking scheme.
1485  * This is because we cannot sleep with the original spinlock
1486  * held.
1487  */
1488 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1489 {
1490         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1491                 TCP_CHECK_TIMER(sk);
1492                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1493                         goto reset;
1494                 TCP_CHECK_TIMER(sk);
1495                 return 0;
1496         }
1497
1498         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1499                 goto csum_err;
1500
1501         if (sk->sk_state == TCP_LISTEN) {
1502                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1503                 if (!nsk)
1504                         goto discard;
1505
1506                 if (nsk != sk) {
1507                         if (tcp_child_process(sk, nsk, skb))
1508                                 goto reset;
1509                         return 0;
1510                 }
1511         }
1512
1513         TCP_CHECK_TIMER(sk);
1514         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1515                 goto reset;
1516         TCP_CHECK_TIMER(sk);
1517         return 0;
1518
1519 reset:
1520         tcp_v4_send_reset(skb);
1521 discard:
1522         kfree_skb(skb);
1523         /* Be careful here. If this function gets more complicated and
1524          * gcc suffers from register pressure on the x86, sk (in %ebx)
1525          * might be destroyed here. This current version compiles correctly,
1526          * but you have been warned.
1527          */
1528         return 0;
1529
1530 csum_err:
1531         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1532         goto discard;
1533 }
1534
1535 /*
1536  *      From tcp_input.c
1537  */
1538
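     /* Main receive entry point, called from the IP layer in softirq context:
      * validate the header, fill in the TCP control block of the skb, look up
      * the owning socket and then either process the segment directly,
      * prequeue it, or add it to the socket backlog if the socket is owned by
      * a user context.
      */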
1539 int tcp_v4_rcv(struct sk_buff *skb)
1540 {
1541         struct tcphdr *th;
1542         struct sock *sk;
1543         int ret;
1544
1545         if (skb->pkt_type != PACKET_HOST)
1546                 goto discard_it;
1547
1548         /* Count it even if it's bad */
1549         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1550
1551         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1552                 goto discard_it;
1553
1554         th = skb->h.th;
1555
1556         if (th->doff < sizeof(struct tcphdr) / 4)
1557                 goto bad_packet;
1558         if (!pskb_may_pull(skb, th->doff * 4))
1559                 goto discard_it;
1560
1561         /* An explanation is required here, I think.
1562          * Packet length and doff are validated by header prediction,
1563          * provided the case of th->doff==0 is eliminated.
1564          * So, we defer the checks. */
1565         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1566              tcp_v4_checksum_init(skb) < 0))
1567                 goto bad_packet;
1568
1569         th = skb->h.th;
1570         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1571         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1572                                     skb->len - th->doff * 4);
1573         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1574         TCP_SKB_CB(skb)->when    = 0;
1575         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1576         TCP_SKB_CB(skb)->sacked  = 0;
1577
1578         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1579                              skb->nh.iph->daddr, ntohs(th->dest),
1580                              tcp_v4_iif(skb));
1581
1582         if (!sk)
1583                 goto no_tcp_socket;
1584
1585 process:
1586         if (sk->sk_state == TCP_TIME_WAIT)
1587                 goto do_time_wait;
1588
1589         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1590                 goto discard_and_relse;
1591
1592         if (sk_filter(sk, skb, 0))
1593                 goto discard_and_relse;
1594
1595         skb->dev = NULL;
1596
1597         bh_lock_sock(sk);
1598         ret = 0;
1599         if (!sock_owned_by_user(sk)) {
1600                 if (!tcp_prequeue(sk, skb))
1601                         ret = tcp_v4_do_rcv(sk, skb);
1602         } else
1603                 sk_add_backlog(sk, skb);
1604         bh_unlock_sock(sk);
1605
1606         sock_put(sk);
1607
1608         return ret;
1609
1610 no_tcp_socket:
1611         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1612                 goto discard_it;
1613
1614         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1615 bad_packet:
1616                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1617         } else {
1618                 tcp_v4_send_reset(skb);
1619         }
1620
1621 discard_it:
1622         /* Discard frame. */
1623         kfree_skb(skb);
1624         return 0;
1625
1626 discard_and_relse:
1627         sock_put(sk);
1628         goto discard_it;
1629
1630 do_time_wait:
1631         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1632                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1633                 goto discard_it;
1634         }
1635
1636         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1637                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1638                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1639                 goto discard_it;
1640         }
1641         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1642                                            skb, th, skb->len)) {
1643         case TCP_TW_SYN: {
1644                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1645                                                           ntohs(th->dest),
1646                                                           tcp_v4_iif(skb));
1647                 if (sk2) {
1648                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1649                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1650                         sk = sk2;
1651                         goto process;
1652                 }
1653                 /* Fall through to ACK */
1654         }
1655         case TCP_TW_ACK:
1656                 tcp_v4_timewait_ack(sk, skb);
1657                 break;
1658         case TCP_TW_RST:
1659                 goto no_tcp_socket;
1660         case TCP_TW_SUCCESS:;
1661         }
1662         goto discard_it;
1663 }
1664
1665 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1666 {
1667         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1668         struct inet_sock *inet = inet_sk(sk);
1669
1670         sin->sin_family         = AF_INET;
1671         sin->sin_addr.s_addr    = inet->daddr;
1672         sin->sin_port           = inet->dport;
1673 }
1674
1675 /* VJ's idea. Save the last timestamp seen from this destination and hold
1676  * it for at least the normal timewait interval, to use for duplicate
1677  * segment detection in subsequent connections before they enter the
1678  * synchronized state.
1679  */
1680
1681 int tcp_v4_remember_stamp(struct sock *sk)
1682 {
1683         struct inet_sock *inet = inet_sk(sk);
1684         struct tcp_sock *tp = tcp_sk(sk);
1685         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1686         struct inet_peer *peer = NULL;
1687         int release_it = 0;
1688
1689         if (!rt || rt->rt_dst != inet->daddr) {
1690                 peer = inet_getpeer(inet->daddr, 1);
1691                 release_it = 1;
1692         } else {
1693                 if (!rt->peer)
1694                         rt_bind_peer(rt, 1);
1695                 peer = rt->peer;
1696         }
1697
1698         if (peer) {
1699                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1700                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1701                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1702                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1703                         peer->tcp_ts = tp->rx_opt.ts_recent;
1704                 }
1705                 if (release_it)
1706                         inet_putpeer(peer);
1707                 return 1;
1708         }
1709
1710         return 0;
1711 }
1712
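     /* Same idea as tcp_v4_remember_stamp(), but for a connection that is now
      * only represented by a TIME_WAIT bucket rather than a full socket.
      */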
1713 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1714 {
1715         struct inet_peer *peer = NULL;
1716
1717         peer = inet_getpeer(tw->tw_daddr, 1);
1718
1719         if (peer) {
1720                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1721                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1722                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1723                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1724                         peer->tcp_ts = tw->tw_ts_recent;
1725                 }
1726                 inet_putpeer(peer);
1727                 return 1;
1728         }
1729
1730         return 0;
1731 }
1732
1733 struct tcp_func ipv4_specific = {
1734         .queue_xmit     =       ip_queue_xmit,
1735         .send_check     =       tcp_v4_send_check,
1736         .rebuild_header =       inet_sk_rebuild_header,
1737         .conn_request   =       tcp_v4_conn_request,
1738         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1739         .remember_stamp =       tcp_v4_remember_stamp,
1740         .net_header_len =       sizeof(struct iphdr),
1741         .setsockopt     =       ip_setsockopt,
1742         .getsockopt     =       ip_getsockopt,
1743         .addr2sockaddr  =       v4_addr2sockaddr,
1744         .sockaddr_len   =       sizeof(struct sockaddr_in),
1745 };
1746
1747 /* NOTE: A lot of things are set to zero explicitly by the call to
1748  *       sk_alloc(), so they need not be done here.
1749  */
1750 static int tcp_v4_init_sock(struct sock *sk)
1751 {
1752         struct tcp_sock *tp = tcp_sk(sk);
1753
1754         skb_queue_head_init(&tp->out_of_order_queue);
1755         tcp_init_xmit_timers(sk);
1756         tcp_prequeue_init(tp);
1757
1758         tp->rto  = TCP_TIMEOUT_INIT;
1759         tp->mdev = TCP_TIMEOUT_INIT;
1760
1761         /* So many TCP implementations out there (incorrectly) count the
1762          * initial SYN frame in their delayed-ACK and congestion control
1763          * algorithms that we must have the following bandaid to talk
1764          * efficiently to them.  -DaveM
1765          */
1766         tp->snd_cwnd = 2;
1767
1768         /* See draft-stevens-tcpca-spec-01 for discussion of the
1769          * initialization of these values.
1770          */
1771         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1772         tp->snd_cwnd_clamp = ~0;
1773         tp->mss_cache = 536;
1774
1775         tp->reordering = sysctl_tcp_reordering;
1776         tp->ca_ops = &tcp_init_congestion_ops;
1777
1778         sk->sk_state = TCP_CLOSE;
1779
1780         sk->sk_write_space = sk_stream_write_space;
1781         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1782
1783         tp->af_specific = &ipv4_specific;
1784
1785         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1786         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1787
1788         atomic_inc(&tcp_sockets_allocated);
1789
1790         return 0;
1791 }
1792
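     /* Release per-connection TCP state when a socket is destroyed: timers,
      * congestion control state, queued data, the cached sendmsg page and the
      * reference on the local bind bucket.
      */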
1793 int tcp_v4_destroy_sock(struct sock *sk)
1794 {
1795         struct tcp_sock *tp = tcp_sk(sk);
1796
1797         tcp_clear_xmit_timers(sk);
1798
1799         tcp_cleanup_congestion_control(tp);
1800
1801         /* Clean up the write buffer. */
1802         sk_stream_writequeue_purge(sk);
1803
1804         /* Cleans up our, hopefully empty, out_of_order_queue. */
1805         __skb_queue_purge(&tp->out_of_order_queue);
1806
1807         /* Clean up the prequeue; it really should be empty. */
1808         __skb_queue_purge(&tp->ucopy.prequeue);
1809
1810         /* Clean up a referenced TCP bind bucket. */
1811         if (inet_sk(sk)->bind_hash)
1812                 inet_put_port(&tcp_hashinfo, sk);
1813
1814         /*
1815          * If a cached sendmsg page exists, toss it.
1816          */
1817         if (sk->sk_sndmsg_page) {
1818                 __free_page(sk->sk_sndmsg_page);
1819                 sk->sk_sndmsg_page = NULL;
1820         }
1821
1822         atomic_dec(&tcp_sockets_allocated);
1823
1824         return 0;
1825 }
1826
1827 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1828
1829 #ifdef CONFIG_PROC_FS
1830 /* Proc filesystem TCP sock list dumping. */
1831
1832 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1833 {
1834         return hlist_empty(head) ? NULL :
1835                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1836 }
1837
1838 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1839 {
1840         return tw->tw_node.next ?
1841                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1842 }
1843
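     /* /proc iteration over the listening hash: for each listening socket of
      * the requested family, its SYN queue of open requests is walked as well,
      * under the listener's syn_wait_lock.
      */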
1844 static void *listening_get_next(struct seq_file *seq, void *cur)
1845 {
1846         struct tcp_sock *tp;
1847         struct hlist_node *node;
1848         struct sock *sk = cur;
1849         struct tcp_iter_state* st = seq->private;
1850
1851         if (!sk) {
1852                 st->bucket = 0;
1853                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1854                 goto get_sk;
1855         }
1856
1857         ++st->num;
1858
1859         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1860                 struct request_sock *req = cur;
1861
1862                 tp = tcp_sk(st->syn_wait_sk);
1863                 req = req->dl_next;
1864                 while (1) {
1865                         while (req) {
1866                                 if (req->rsk_ops->family == st->family) {
1867                                         cur = req;
1868                                         goto out;
1869                                 }
1870                                 req = req->dl_next;
1871                         }
1872                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1873                                 break;
1874 get_req:
1875                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1876                 }
1877                 sk        = sk_next(st->syn_wait_sk);
1878                 st->state = TCP_SEQ_STATE_LISTENING;
1879                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1880         } else {
1881                 tp = tcp_sk(sk);
1882                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1883                 if (reqsk_queue_len(&tp->accept_queue))
1884                         goto start_req;
1885                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1886                 sk = sk_next(sk);
1887         }
1888 get_sk:
1889         sk_for_each_from(sk, node) {
1890                 if (sk->sk_family == st->family) {
1891                         cur = sk;
1892                         goto out;
1893                 }
1894                 tp = tcp_sk(sk);
1895                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1896                 if (reqsk_queue_len(&tp->accept_queue)) {
1897 start_req:
1898                         st->uid         = sock_i_uid(sk);
1899                         st->syn_wait_sk = sk;
1900                         st->state       = TCP_SEQ_STATE_OPENREQ;
1901                         st->sbucket     = 0;
1902                         goto get_req;
1903                 }
1904                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1905         }
1906         if (++st->bucket < INET_LHTABLE_SIZE) {
1907                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1908                 goto get_sk;
1909         }
1910         cur = NULL;
1911 out:
1912         return cur;
1913 }
1914
1915 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1916 {
1917         void *rc = listening_get_next(seq, NULL);
1918
1919         while (rc && *pos) {
1920                 rc = listening_get_next(seq, rc);
1921                 --*pos;
1922         }
1923         return rc;
1924 }
1925
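     /* Find the first socket of the requested family in the established hash.
      * Each bucket is scanned in two parts: the chain at st->bucket holds fully
      * established sockets, and the chain at st->bucket + ehash_size holds the
      * corresponding TIME_WAIT buckets.
      */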
1926 static void *established_get_first(struct seq_file *seq)
1927 {
1928         struct tcp_iter_state* st = seq->private;
1929         void *rc = NULL;
1930
1931         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1932                 struct sock *sk;
1933                 struct hlist_node *node;
1934                 struct tcp_tw_bucket *tw;
1935
1936                 /* We can reschedule _before_ having picked the target: */
1937                 cond_resched_softirq();
1938
1939                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1940                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1941                         if (sk->sk_family != st->family) {
1942                                 continue;
1943                         }
1944                         rc = sk;
1945                         goto out;
1946                 }
1947                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1948                 tw_for_each(tw, node,
1949                             &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1950                         if (tw->tw_family != st->family) {
1951                                 continue;
1952                         }
1953                         rc = tw;
1954                         goto out;
1955                 }
1956                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1957                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1958         }
1959 out:
1960         return rc;
1961 }
1962
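     /* Advance the /proc iterator: continue along the current established
      * chain, fall over to the bucket's TIME_WAIT chain, and only then move on
      * to the next hash bucket.
      */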
1963 static void *established_get_next(struct seq_file *seq, void *cur)
1964 {
1965         struct sock *sk = cur;
1966         struct tcp_tw_bucket *tw;
1967         struct hlist_node *node;
1968         struct tcp_iter_state* st = seq->private;
1969
1970         ++st->num;
1971
1972         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1973                 tw = cur;
1974                 tw = tw_next(tw);
1975 get_tw:
1976                 while (tw && tw->tw_family != st->family) {
1977                         tw = tw_next(tw);
1978                 }
1979                 if (tw) {
1980                         cur = tw;
1981                         goto out;
1982                 }
1983                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1984                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1985
1986                 /* We can reschedule between buckets: */
1987                 cond_resched_softirq();
1988
1989                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1990                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1991                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1992                 } else {
1993                         cur = NULL;
1994                         goto out;
1995                 }
1996         } else
1997                 sk = sk_next(sk);
1998
1999         sk_for_each_from(sk, node) {
2000                 if (sk->sk_family == st->family)
2001                         goto found;
2002         }
2003
2004         st->state = TCP_SEQ_STATE_TIME_WAIT;
2005         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2006         goto get_tw;
2007 found:
2008         cur = sk;
2009 out:
2010         return cur;
2011 }
2012
2013 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2014 {
2015         void *rc = established_get_first(seq);
2016
2017         while (rc && pos) {
2018                 rc = established_get_next(seq, rc);
2019                 --pos;
2020         }
2021         return rc;
2022 }
2023
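     /* Position the /proc iterator at logical entry 'pos': take the listening
      * lock and walk the listening table first, then (with BHs disabled) fall
      * through to the established hash if 'pos' lies beyond it.
      */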
2024 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2025 {
2026         void *rc;
2027         struct tcp_iter_state* st = seq->private;
2028
2029         inet_listen_lock(&tcp_hashinfo);
2030         st->state = TCP_SEQ_STATE_LISTENING;
2031         rc        = listening_get_idx(seq, &pos);
2032
2033         if (!rc) {
2034                 inet_listen_unlock(&tcp_hashinfo);
2035                 local_bh_disable();
2036                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2037                 rc        = established_get_idx(seq, pos);
2038         }
2039
2040         return rc;
2041 }
2042
2043 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2044 {
2045         struct tcp_iter_state* st = seq->private;
2046         st->state = TCP_SEQ_STATE_LISTENING;
2047         st->num = 0;
2048         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2049 }
2050
2051 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2052 {
2053         void *rc = NULL;
2054         struct tcp_iter_state* st;
2055
2056         if (v == SEQ_START_TOKEN) {
2057                 rc = tcp_get_idx(seq, 0);
2058                 goto out;
2059         }
2060         st = seq->private;
2061
2062         switch (st->state) {
2063         case TCP_SEQ_STATE_OPENREQ:
2064         case TCP_SEQ_STATE_LISTENING:
2065                 rc = listening_get_next(seq, v);
2066                 if (!rc) {
2067                         inet_listen_unlock(&tcp_hashinfo);
2068                         local_bh_disable();
2069                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2070                         rc        = established_get_first(seq);
2071                 }
2072                 break;
2073         case TCP_SEQ_STATE_ESTABLISHED:
2074         case TCP_SEQ_STATE_TIME_WAIT:
2075                 rc = established_get_next(seq, v);
2076                 break;
2077         }
2078 out:
2079         ++*pos;
2080         return rc;
2081 }
2082
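     /* Drop whatever lock the iterator still holds, which depends on the state
      * it stopped in: the listener's syn_wait_lock, the listening lock, or an
      * ehash bucket lock with BHs disabled.
      */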
2083 static void tcp_seq_stop(struct seq_file *seq, void *v)
2084 {
2085         struct tcp_iter_state* st = seq->private;
2086
2087         switch (st->state) {
2088         case TCP_SEQ_STATE_OPENREQ:
2089                 if (v) {
2090                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2091                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2092                 }
2093         case TCP_SEQ_STATE_LISTENING:
2094                 if (v != SEQ_START_TOKEN)
2095                         inet_listen_unlock(&tcp_hashinfo);
2096                 break;
2097         case TCP_SEQ_STATE_TIME_WAIT:
2098         case TCP_SEQ_STATE_ESTABLISHED:
2099                 if (v)
2100                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2101                 local_bh_enable();
2102                 break;
2103         }
2104 }
2105
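     /* seq_file open callback shared by every registered tcp_seq_afinfo: the
      * per-family afinfo hangs off the proc entry and supplies the show
      * routine, while the start/next/stop iterators are common.
      */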
2106 static int tcp_seq_open(struct inode *inode, struct file *file)
2107 {
2108         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2109         struct seq_file *seq;
2110         struct tcp_iter_state *s;
2111         int rc;
2112
2113         if (unlikely(afinfo == NULL))
2114                 return -EINVAL;
2115
2116         s = kmalloc(sizeof(*s), GFP_KERNEL);
2117         if (!s)
2118                 return -ENOMEM;
2119         memset(s, 0, sizeof(*s));
2120         s->family               = afinfo->family;
2121         s->seq_ops.start        = tcp_seq_start;
2122         s->seq_ops.next         = tcp_seq_next;
2123         s->seq_ops.show         = afinfo->seq_show;
2124         s->seq_ops.stop         = tcp_seq_stop;
2125
2126         rc = seq_open(file, &s->seq_ops);
2127         if (rc)
2128                 goto out_kfree;
2129         seq          = file->private_data;
2130         seq->private = s;
2131 out:
2132         return rc;
2133 out_kfree:
2134         kfree(s);
2135         goto out;
2136 }
2137
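     /* Register a per-family TCP seq_file under /proc/net.  The caller supplies
      * the file name, address family and show routine in afinfo; the common
      * open/read/llseek/release operations are filled in here.
      */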
2138 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2139 {
2140         int rc = 0;
2141         struct proc_dir_entry *p;
2142
2143         if (!afinfo)
2144                 return -EINVAL;
2145         afinfo->seq_fops->owner         = afinfo->owner;
2146         afinfo->seq_fops->open          = tcp_seq_open;
2147         afinfo->seq_fops->read          = seq_read;
2148         afinfo->seq_fops->llseek        = seq_lseek;
2149         afinfo->seq_fops->release       = seq_release_private;
2150
2151         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2152         if (p)
2153                 p->data = afinfo;
2154         else
2155                 rc = -ENOMEM;
2156         return rc;
2157 }
2158
2159 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2160 {
2161         if (!afinfo)
2162                 return;
2163         proc_net_remove(afinfo->name);
2164         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2165 }
2166
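     /* Format one half-open (SYN_RECV) connection request as a line of
      * /proc/net/tcp output.
      */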
2167 static void get_openreq4(struct sock *sk, struct request_sock *req,
2168                          char *tmpbuf, int i, int uid)
2169 {
2170         const struct inet_request_sock *ireq = inet_rsk(req);
2171         int ttd = req->expires - jiffies;
2172
2173         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2174                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2175                 i,
2176                 ireq->loc_addr,
2177                 ntohs(inet_sk(sk)->sport),
2178                 ireq->rmt_addr,
2179                 ntohs(ireq->rmt_port),
2180                 TCP_SYN_RECV,
2181                 0, 0, /* could print option size, but that is af dependent. */
2182                 1,    /* timers active (only the expire timer) */
2183                 jiffies_to_clock_t(ttd),
2184                 req->retrans,
2185                 uid,
2186                 0,  /* non-standard timer */
2187                 0, /* open_requests have no inode */
2188                 atomic_read(&sk->sk_refcnt),
2189                 req);
2190 }
2191
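     /* Format one full socket (listening or established) as a line of
      * /proc/net/tcp output, including queue sizes and the active timer.
      */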
2192 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2193 {
2194         int timer_active;
2195         unsigned long timer_expires;
2196         struct tcp_sock *tp = tcp_sk(sp);
2197         struct inet_sock *inet = inet_sk(sp);
2198         unsigned int dest = inet->daddr;
2199         unsigned int src = inet->rcv_saddr;
2200         __u16 destp = ntohs(inet->dport);
2201         __u16 srcp = ntohs(inet->sport);
2202
2203         if (tp->pending == TCP_TIME_RETRANS) {
2204                 timer_active    = 1;
2205                 timer_expires   = tp->timeout;
2206         } else if (tp->pending == TCP_TIME_PROBE0) {
2207                 timer_active    = 4;
2208                 timer_expires   = tp->timeout;
2209         } else if (timer_pending(&sp->sk_timer)) {
2210                 timer_active    = 2;
2211                 timer_expires   = sp->sk_timer.expires;
2212         } else {
2213                 timer_active    = 0;
2214                 timer_expires = jiffies;
2215         }
2216
2217         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2218                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2219                 i, src, srcp, dest, destp, sp->sk_state,
2220                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2221                 timer_active,
2222                 jiffies_to_clock_t(timer_expires - jiffies),
2223                 tp->retransmits,
2224                 sock_i_uid(sp),
2225                 tp->probes_out,
2226                 sock_i_ino(sp),
2227                 atomic_read(&sp->sk_refcnt), sp,
2228                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2229                 tp->snd_cwnd,
2230                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2231 }
2232
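     /* Format one TIME_WAIT bucket as a line of /proc/net/tcp output; most
      * fields are reported as zero since no full socket exists any more.
      */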
2233 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2234 {
2235         unsigned int dest, src;
2236         __u16 destp, srcp;
2237         int ttd = tw->tw_ttd - jiffies;
2238
2239         if (ttd < 0)
2240                 ttd = 0;
2241
2242         dest  = tw->tw_daddr;
2243         src   = tw->tw_rcv_saddr;
2244         destp = ntohs(tw->tw_dport);
2245         srcp  = ntohs(tw->tw_sport);
2246
2247         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2248                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2249                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2250                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2251                 atomic_read(&tw->tw_refcnt), tw);
2252 }
2253
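     /* Size of the scratch buffer for one /proc/net/tcp line; tcp4_seq_show()
      * pads every line to TMPSZ - 1 characters.
      */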
2254 #define TMPSZ 150
2255
2256 static int tcp4_seq_show(struct seq_file *seq, void *v)
2257 {
2258         struct tcp_iter_state* st;
2259         char tmpbuf[TMPSZ + 1];
2260
2261         if (v == SEQ_START_TOKEN) {
2262                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2263                            "  sl  local_address rem_address   st tx_queue "
2264                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2265                            "inode");
2266                 goto out;
2267         }
2268         st = seq->private;
2269
2270         switch (st->state) {
2271         case TCP_SEQ_STATE_LISTENING:
2272         case TCP_SEQ_STATE_ESTABLISHED:
2273                 get_tcp4_sock(v, tmpbuf, st->num);
2274                 break;
2275         case TCP_SEQ_STATE_OPENREQ:
2276                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2277                 break;
2278         case TCP_SEQ_STATE_TIME_WAIT:
2279                 get_timewait4_sock(v, tmpbuf, st->num);
2280                 break;
2281         }
2282         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2283 out:
2284         return 0;
2285 }
2286
2287 static struct file_operations tcp4_seq_fops;
2288 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2289         .owner          = THIS_MODULE,
2290         .name           = "tcp",
2291         .family         = AF_INET,
2292         .seq_show       = tcp4_seq_show,
2293         .seq_fops       = &tcp4_seq_fops,
2294 };
2295
2296 int __init tcp4_proc_init(void)
2297 {
2298         return tcp_proc_register(&tcp4_seq_afinfo);
2299 }
2300
2301 void tcp4_proc_exit(void)
2302 {
2303         tcp_proc_unregister(&tcp4_seq_afinfo);
2304 }
2305 #endif /* CONFIG_PROC_FS */
2306
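     /* The TCP protocol operations registered with the socket layer for
      * IPv4 SOCK_STREAM sockets.
      */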
2307 struct proto tcp_prot = {
2308         .name                   = "TCP",
2309         .owner                  = THIS_MODULE,
2310         .close                  = tcp_close,
2311         .connect                = tcp_v4_connect,
2312         .disconnect             = tcp_disconnect,
2313         .accept                 = tcp_accept,
2314         .ioctl                  = tcp_ioctl,
2315         .init                   = tcp_v4_init_sock,
2316         .destroy                = tcp_v4_destroy_sock,
2317         .shutdown               = tcp_shutdown,
2318         .setsockopt             = tcp_setsockopt,
2319         .getsockopt             = tcp_getsockopt,
2320         .sendmsg                = tcp_sendmsg,
2321         .recvmsg                = tcp_recvmsg,
2322         .backlog_rcv            = tcp_v4_do_rcv,
2323         .hash                   = tcp_v4_hash,
2324         .unhash                 = tcp_unhash,
2325         .get_port               = tcp_v4_get_port,
2326         .enter_memory_pressure  = tcp_enter_memory_pressure,
2327         .sockets_allocated      = &tcp_sockets_allocated,
2328         .memory_allocated       = &tcp_memory_allocated,
2329         .memory_pressure        = &tcp_memory_pressure,
2330         .sysctl_mem             = sysctl_tcp_mem,
2331         .sysctl_wmem            = sysctl_tcp_wmem,
2332         .sysctl_rmem            = sysctl_tcp_rmem,
2333         .max_header             = MAX_TCP_HEADER,
2334         .obj_size               = sizeof(struct tcp_sock),
2335         .rsk_prot               = &tcp_request_sock_ops,
2336 };
2337
2338
2339
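     /* Create the kernel-internal TCP control socket.  It is never hashed, so
      * it receives no packets; it only provides a socket to transmit control
      * segments (e.g. the RSTs sent by tcp_v4_send_reset()) on behalf of
      * connections that have no full socket of their own.
      */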
2340 void __init tcp_v4_init(struct net_proto_family *ops)
2341 {
2342         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2343         if (err < 0)
2344                 panic("Failed to create the TCP control socket.\n");
2345         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2346         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2347
2348         /* Unhash it so that IP input processing does not even
2349          * see it; we do not want this socket to receive incoming
2350          * packets.
2351          */
2352         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2353 }
2354
2355 EXPORT_SYMBOL(ipv4_specific);
2356 EXPORT_SYMBOL(inet_bind_bucket_create);
2357 EXPORT_SYMBOL(tcp_hashinfo);
2358 EXPORT_SYMBOL(tcp_prot);
2359 EXPORT_SYMBOL(tcp_unhash);
2360 EXPORT_SYMBOL(tcp_v4_conn_request);
2361 EXPORT_SYMBOL(tcp_v4_connect);
2362 EXPORT_SYMBOL(tcp_v4_do_rcv);
2363 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2364 EXPORT_SYMBOL(tcp_v4_send_check);
2365 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2366
2367 #ifdef CONFIG_PROC_FS
2368 EXPORT_SYMBOL(tcp_proc_register);
2369 EXPORT_SYMBOL(tcp_proc_unregister);
2370 #endif
2371 EXPORT_SYMBOL(sysctl_local_port_range);
2372 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2373 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2374