net/ipv4/tcp_ipv4.c [linux-2.4.git]
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.237.2.1 2002/01/15 08:49:49 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an ACK bit.
36  *              Andi Kleen :            Implemented fast path mtu discovery.
37  *                                      Fixed many serious bugs in the
38  *                                      open_request handling and moved
39  *                                      most of it into the af independent code.
40  *                                      Added tail drop and some other bugfixes.
41  *                                      Added new listen semantics.
42  *              Mike McLagan    :       Routing by source
43  *      Juan Jose Ciarlante:            ip_dynaddr bits
44  *              Andi Kleen:             various fixes.
45  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #include <linux/config.h>
54
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61
62 #include <net/icmp.h>
63 #include <net/tcp.h>
64 #include <net/ipv6.h>
65 #include <net/inet_common.h>
66
67 #include <linux/inet.h>
68 #include <linux/stddef.h>
69 #include <linux/ipsec.h>
70
71 extern int sysctl_ip_dynaddr;
72 extern int sysctl_ip_default_ttl;
73 int sysctl_tcp_tw_reuse = 0;
74 int sysctl_tcp_low_latency = 0;
75
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
78
79 /* Socket used for sending RSTs */      
80 static struct inode tcp_inode;
81 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
82
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 
84                        struct sk_buff *skb);
85
86 /*
87  * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
88  */
89 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
90         __tcp_ehash:          NULL,
91         __tcp_bhash:          NULL,
92         __tcp_bhash_size:     0,
93         __tcp_ehash_size:     0,
94         __tcp_listening_hash: { NULL, },
95         __tcp_lhash_lock:     RW_LOCK_UNLOCKED,
96         __tcp_lhash_users:    ATOMIC_INIT(0),
97         __tcp_lhash_wait:
98           __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
99         __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
100 };
101
102 /*
103  * This array holds the first and last local port number.
104  * For high-usage systems, use sysctl to change this to
105  * 32768-61000
106  */
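/* e.g.: echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range */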
107 int sysctl_local_port_range[2] = { 1024, 4999 };
108 int tcp_port_rover = (1024 - 1);
109
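/* Hash the connection 4-tuple into the established hash table.  The XOR
 * folding mixes the high bits down before masking; tcp_ehash_size is
 * assumed to be a power of two so that the mask selects a valid bucket.
 */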
110 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
111                                  __u32 faddr, __u16 fport)
112 {
113         int h = ((laddr ^ lport) ^ (faddr ^ fport));
114         h ^= h>>16;
115         h ^= h>>8;
116         return h & (tcp_ehash_size - 1);
117 }
118
119 static __inline__ int tcp_sk_hashfn(struct sock *sk)
120 {
121         __u32 laddr = sk->rcv_saddr;
122         __u16 lport = sk->num;
123         __u32 faddr = sk->daddr;
124         __u16 fport = sk->dport;
125
126         return tcp_hashfn(laddr, lport, faddr, fport);
127 }
128
129 /* Allocate and initialize a new TCP local port bind bucket.
130  * The bindhash mutex for snum's hash chain must be held here.
131  */
132 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
133                                           unsigned short snum)
134 {
135         struct tcp_bind_bucket *tb;
136
137         tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
138         if(tb != NULL) {
139                 tb->port = snum;
140                 tb->fastreuse = 0;
141                 tb->owners = NULL;
142                 if((tb->next = head->chain) != NULL)
143                         tb->next->pprev = &tb->next;
144                 head->chain = tb;
145                 tb->pprev = &head->chain;
146         }
147         return tb;
148 }
149
150 /* Caller must disable local BH processing. */
151 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
152 {
153         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
154         struct tcp_bind_bucket *tb;
155
156         spin_lock(&head->lock);
157         tb = (struct tcp_bind_bucket *)sk->prev;
158         if ((child->bind_next = tb->owners) != NULL)
159                 tb->owners->bind_pprev = &child->bind_next;
160         tb->owners = child;
161         child->bind_pprev = &tb->owners;
162         child->prev = (struct sock *) tb;
163         spin_unlock(&head->lock);
164 }
165
166 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
167 {
168         local_bh_disable();
169         __tcp_inherit_port(sk, child);
170         local_bh_enable();
171 }
172
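/* Link sk into the owner list of bind bucket tb and remember the bucket
 * in sk->prev.  The caller must hold the bucket's hash chain lock.
 */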
173 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
174 {
175         sk->num = snum;
176         if ((sk->bind_next = tb->owners) != NULL)
177                 tb->owners->bind_pprev = &sk->bind_next;
178         tb->owners = sk;
179         sk->bind_pprev = &tb->owners;
180         sk->prev = (struct sock *) tb;
181 }
182
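/* Return nonzero if another socket in the bind bucket prevents sk from
 * sharing the port, taking SO_REUSEADDR, the bound device and the local
 * receive address into account.
 */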
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 {
185         struct sock *sk2 = tb->owners;
186         int sk_reuse = sk->reuse;
187         
188         for( ; sk2 != NULL; sk2 = sk2->bind_next) {
189                 if (sk != sk2 &&
190                     sk2->reuse <= 1 &&
191                     !ipv6_only_sock(sk2) &&
192                     (!sk->bound_dev_if ||
193                      !sk2->bound_dev_if ||
194                      sk->bound_dev_if == sk2->bound_dev_if)) {
195                         if (!sk_reuse   ||
196                             !sk2->reuse ||
197                             sk2->state == TCP_LISTEN) {
198                                 if (!sk2->rcv_saddr     ||
199                                     !sk->rcv_saddr      ||
200                                     (sk2->rcv_saddr == sk->rcv_saddr))
201                                         break;
202                         }
203                 }
204         }
205         return sk2 != NULL;
206 }
207
208 /* Obtain a reference to a local port for the given sock,
209  * if snum is zero it means select any available local port.
210  */
211 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
212 {
213         struct tcp_bind_hashbucket *head;
214         struct tcp_bind_bucket *tb;
215         int ret;
216
217         local_bh_disable();
218         if (snum == 0) {
219                 int low = sysctl_local_port_range[0];
220                 int high = sysctl_local_port_range[1];
221                 int remaining = (high - low) + 1;
222                 int rover;
223
224                 spin_lock(&tcp_portalloc_lock);
225                 rover = tcp_port_rover;
226                 do {    rover++;
227                         if ((rover < low) || (rover > high))
228                                 rover = low;
229                         head = &tcp_bhash[tcp_bhashfn(rover)];
230                         spin_lock(&head->lock);
231                         for (tb = head->chain; tb; tb = tb->next)
232                                 if (tb->port == rover)
233                                         goto next;
234                         break;
235                 next:
236                         spin_unlock(&head->lock);
237                 } while (--remaining > 0);
238                 tcp_port_rover = rover;
239                 spin_unlock(&tcp_portalloc_lock);
240
241                 /* Exhausted local port range during search? */
242                 ret = 1;
243                 if (remaining <= 0)
244                         goto fail;
245
246                 /* OK, here is the one we will use.  HEAD is
247                  * non-NULL and we hold its mutex.
248                  */
249                 snum = rover;
250                 tb = NULL;
251         } else {
252                 head = &tcp_bhash[tcp_bhashfn(snum)];
253                 spin_lock(&head->lock);
254                 for (tb = head->chain; tb != NULL; tb = tb->next)
255                         if (tb->port == snum)
256                                 break;
257         }
258         if (tb != NULL && tb->owners != NULL) {
259                 if (sk->reuse > 1)
260                         goto success;
261                 if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
262                         goto success;
263                 } else {
264                         ret = 1;
265                         if (tcp_bind_conflict(sk, tb))
266                                 goto fail_unlock;
267                 }
268         }
269         ret = 1;
270         if (tb == NULL &&
271             (tb = tcp_bucket_create(head, snum)) == NULL)
272                         goto fail_unlock;
273         if (tb->owners == NULL) {
274                 if (sk->reuse && sk->state != TCP_LISTEN)
275                         tb->fastreuse = 1;
276                 else
277                         tb->fastreuse = 0;
278         } else if (tb->fastreuse &&
279                    ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
280                 tb->fastreuse = 0;
281 success:
282         if (sk->prev == NULL)
283                 tcp_bind_hash(sk, tb, snum);
284         BUG_TRAP(sk->prev == (struct sock *) tb);
285         ret = 0;
286
287 fail_unlock:
288         spin_unlock(&head->lock);
289 fail:
290         local_bh_enable();
291         return ret;
292 }
293
294 /* Get rid of any references to a local port held by the
295  * given sock.
296  */
297 inline void __tcp_put_port(struct sock *sk)
298 {
299         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
300         struct tcp_bind_bucket *tb;
301
302         spin_lock(&head->lock);
303         tb = (struct tcp_bind_bucket *) sk->prev;
304         if (sk->bind_next)
305                 sk->bind_next->bind_pprev = sk->bind_pprev;
306         *(sk->bind_pprev) = sk->bind_next;
307         sk->prev = NULL;
308         sk->num = 0;
309         if (tb->owners == NULL) {
310                 if (tb->next)
311                         tb->next->pprev = tb->pprev;
312                 *(tb->pprev) = tb->next;
313                 kmem_cache_free(tcp_bucket_cachep, tb);
314         }
315         spin_unlock(&head->lock);
316 }
317
318 void tcp_put_port(struct sock *sk)
319 {
320         local_bh_disable();
321         __tcp_put_port(sk);
322         local_bh_enable();
323 }
324
325 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
326  * Look, when several writers sleep and a reader wakes them up, all but one
327  * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
328  * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
329  * exclusive lock release). It really should be ifdef'd.
330  */
331
332 void tcp_listen_wlock(void)
333 {
334         write_lock(&tcp_lhash_lock);
335
336         if (atomic_read(&tcp_lhash_users)) {
337                 DECLARE_WAITQUEUE(wait, current);
338
339                 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
340                 for (;;) {
341                         set_current_state(TASK_UNINTERRUPTIBLE);
342                         if (atomic_read(&tcp_lhash_users) == 0)
343                                 break;
344                         write_unlock_bh(&tcp_lhash_lock);
345                         schedule();
346                         write_lock_bh(&tcp_lhash_lock);
347                 }
348
349                 __set_current_state(TASK_RUNNING);
350                 remove_wait_queue(&tcp_lhash_wait, &wait);
351         }
352 }
353
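/* Insert sk into the listening hash or the established hash, depending
 * on its state.  Local BHs must already be disabled by the caller.
 */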
354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
355 {
356         struct sock **skp;
357         rwlock_t *lock;
358
359         BUG_TRAP(sk->pprev==NULL);
360         if(listen_possible && sk->state == TCP_LISTEN) {
361                 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
362                 lock = &tcp_lhash_lock;
363                 tcp_listen_wlock();
364         } else {
365                 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
366                 lock = &tcp_ehash[sk->hashent].lock;
367                 write_lock(lock);
368         }
369         if((sk->next = *skp) != NULL)
370                 (*skp)->pprev = &sk->next;
371         *skp = sk;
372         sk->pprev = skp;
373         sock_prot_inc_use(sk->prot);
374         write_unlock(lock);
375         if (listen_possible && sk->state == TCP_LISTEN)
376                 wake_up(&tcp_lhash_wait);
377 }
378
379 static void tcp_v4_hash(struct sock *sk)
380 {
381         if (sk->state != TCP_CLOSE) {
382                 local_bh_disable();
383                 __tcp_v4_hash(sk, 1);
384                 local_bh_enable();
385         }
386 }
387
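/* Remove sk from whichever hash table (listening or established) it is
 * currently on and drop the protocol use count.
 */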
388 void tcp_unhash(struct sock *sk)
389 {
390         rwlock_t *lock;
391
392         if (!sk->pprev)
393                 goto ende;
394
395         if (sk->state == TCP_LISTEN) {
396                 local_bh_disable();
397                 tcp_listen_wlock();
398                 lock = &tcp_lhash_lock;
399         } else {
400                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
401                 lock = &head->lock;
402                 write_lock_bh(&head->lock);
403         }
404
405         if(sk->pprev) {
406                 if(sk->next)
407                         sk->next->pprev = sk->pprev;
408                 *sk->pprev = sk->next;
409                 sk->pprev = NULL;
410                 sock_prot_dec_use(sk->prot);
411         }
412         write_unlock_bh(lock);
413
414  ende:
415         if (sk->state == TCP_LISTEN)
416                 wake_up(&tcp_lhash_wait);
417 }
418
419 /* Don't inline this cruft.  There are some nice properties to
420  * exploit here.  The BSD API does not allow a listening TCP
421  * to specify the remote port nor the remote address for the
422  * connection.  So always assume those are both wildcarded
423  * during the search since they can never be otherwise.
424  */
425 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
426 {
427         struct sock *result = NULL;
428         int score, hiscore;
429
430         hiscore=-1;
431         for(; sk; sk = sk->next) {
432                 if(sk->num == hnum && !ipv6_only_sock(sk)) {
433                         __u32 rcv_saddr = sk->rcv_saddr;
434
435 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
436                         score = sk->family == PF_INET ? 1 : 0;
437 #else
438                         score = 1;
439 #endif
440                         if(rcv_saddr) {
441                                 if (rcv_saddr != daddr)
442                                         continue;
443                                 score+=2;
444                         }
445                         if (sk->bound_dev_if) {
446                                 if (sk->bound_dev_if != dif)
447                                         continue;
448                                 score+=2;
449                         }
450                         if (score == 5)
451                                 return sk;
452                         if (score > hiscore) {
453                                 hiscore = score;
454                                 result = sk;
455                         }
456                 }
457         }
458         return result;
459 }
460
461 /* Optimize the common listener case. */
462 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
463 {
464         struct sock *sk;
465
466         read_lock(&tcp_lhash_lock);
467         sk = tcp_listening_hash[tcp_lhashfn(hnum)];
468         if (sk) {
469                 if (sk->num == hnum &&
470                     sk->next == NULL &&
471                     (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
472                     (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
473                     !sk->bound_dev_if)
474                         goto sherry_cache;
475                 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
476         }
477         if (sk) {
478 sherry_cache:
479                 sock_hold(sk);
480         }
481         read_unlock(&tcp_lhash_lock);
482         return sk;
483 }
484
485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487  *
488  * Local BH must be disabled here.
489  */
490
491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492                                                        u32 daddr, u16 hnum, int dif)
493 {
494         struct tcp_ehash_bucket *head;
495         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
496         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
497         struct sock *sk;
498         int hash;
499
500         /* Optimize here for a direct hit; only listening connections can
501          * have wildcards anyway.
502          */
503         hash = tcp_hashfn(daddr, hnum, saddr, sport);
504         head = &tcp_ehash[hash];
505         read_lock(&head->lock);
506         for(sk = head->chain; sk; sk = sk->next) {
507                 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508                         goto hit; /* You sunk my battleship! */
509         }
510
511         /* Must check for a TIME_WAIT'er before going to listener hash. */
512         for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
513                 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
514                         goto hit;
515         read_unlock(&head->lock);
516
517         return NULL;
518
519 hit:
520         sock_hold(sk);
521         read_unlock(&head->lock);
522         return sk;
523 }
524
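/* Full receive demultiplex: try the established (and TIME-WAIT) hash
 * first, then fall back to a listener lookup.  Local BHs must be disabled.
 */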
525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526                                            u32 daddr, u16 hnum, int dif)
527 {
528         struct sock *sk;
529
530         sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
531
532         if (sk)
533                 return sk;
534                 
535         return tcp_v4_lookup_listener(daddr, hnum, dif);
536 }
537
538 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
539 {
540         struct sock *sk;
541
542         local_bh_disable();
543         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
544         local_bh_enable();
545
546         return sk;
547 }
548
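/* Pick a secure initial sequence number from the 4-tuple of the incoming
 * segment.
 */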
549 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
550 {
551         return secure_tcp_sequence_number(skb->nh.iph->daddr,
552                                           skb->nh.iph->saddr,
553                                           skb->h.th->dest,
554                                           skb->h.th->source);
555 }
556
557 /* called with local bh disabled */
558 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
559                                       struct tcp_tw_bucket **twp)
560 {
561         u32 daddr = sk->rcv_saddr;
562         u32 saddr = sk->daddr;
563         int dif = sk->bound_dev_if;
564         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565         __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
566         int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
567         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
568         struct sock *sk2, **skp;
569         struct tcp_tw_bucket *tw;
570
571         write_lock(&head->lock);
572
573         /* Check TIME-WAIT sockets first. */
574         for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
575             skp = &sk2->next) {
576                 tw = (struct tcp_tw_bucket*)sk2;
577
578                 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
580
581                         /* With PAWS, it is safe from the viewpoint
582                            of data integrity. Even without PAWS it
583                            is safe provided the sequence spaces do not
584                            overlap, i.e. at data rates <= 80 Mbit/sec.
585
586                            Actually, the idea is close to VJ's: only the
587                            timestamp cache is kept per port pair instead
588                            of per host, and the TW bucket is used as the
589                            state holder.
590
591                            If the TW bucket has already been destroyed we
592                            fall back to VJ's scheme and use the initial
593                            timestamp retrieved from the peer table.
594                          */
595                         if (tw->ts_recent_stamp &&
596                             (!twp || (sysctl_tcp_tw_reuse &&
597                                       xtime.tv_sec - tw->ts_recent_stamp > 1))) {
598                                 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
599                                         tp->write_seq = 1;
600                                 tp->ts_recent = tw->ts_recent;
601                                 tp->ts_recent_stamp = tw->ts_recent_stamp;
602                                 sock_hold(sk2);
603                                 skp = &head->chain;
604                                 goto unique;
605                         } else
606                                 goto not_unique;
607                 }
608         }
609         tw = NULL;
610
611         /* And established part... */
612         for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
613                 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
614                         goto not_unique;
615         }
616
617 unique:
618         /* Must record num and sport now. Otherwise we will see a
619          * socket with a funny identity in the hash table. */
620         sk->num = lport;
621         sk->sport = htons(lport);
622         BUG_TRAP(sk->pprev==NULL);
623         if ((sk->next = *skp) != NULL)
624                 (*skp)->pprev = &sk->next;
625
626         *skp = sk;
627         sk->pprev = skp;
628         sk->hashent = hash;
629         sock_prot_inc_use(sk->prot);
630         write_unlock(&head->lock);
631
632         if (twp) {
633                 *twp = tw;
634                 NET_INC_STATS_BH(TimeWaitRecycled);
635         } else if (tw) {
636                 /* Silly. Should hash-dance instead... */
637                 tcp_tw_deschedule(tw);
638                 tcp_timewait_kill(tw);
639                 NET_INC_STATS_BH(TimeWaitRecycled);
640
641                 tcp_tw_put(tw);
642         }
643
644         return 0;
645
646 not_unique:
647         write_unlock(&head->lock);
648         return -EADDRNOTAVAIL;
649 }
650
651 /*
652  * Bind a port for a connect operation and hash it.
653  */
654 static int tcp_v4_hash_connect(struct sock *sk)
655 {
656         unsigned short snum = sk->num;
657         struct tcp_bind_hashbucket *head;
658         struct tcp_bind_bucket *tb;
659
660         if (snum == 0) {
661                 int rover;
662                 int low = sysctl_local_port_range[0];
663                 int high = sysctl_local_port_range[1];
664                 int remaining = (high - low) + 1;
665                 struct tcp_tw_bucket *tw = NULL;
666
667                 local_bh_disable();
668
669                 /* TODO. Actually it is not such a bad idea to remove
670                  * tcp_portalloc_lock before the next submission to Linus.
671                  * As soon as we touch this place at all, it is time to think.
672                  *
673                  * Right now it protects only the _advisory_ variable
674                  * tcp_port_rover, hence it is mostly useless.
675                  * The code will work nicely if we just delete it, but
676                  * I am afraid that in the contended case it will work no better,
677                  * or even worse: another cpu will just hit the same bucket
678                  * and spin there.
679                  * So some per-cpu salt could remove both the contention and
680                  * the memory ping-pong. Any ideas how to do this in a nice way?
681                  */
682                 spin_lock(&tcp_portalloc_lock);
683                 rover = tcp_port_rover;
684
685                 do {
686                         rover++;
687                         if ((rover < low) || (rover > high))
688                                 rover = low;
689                         head = &tcp_bhash[tcp_bhashfn(rover)];
690                         spin_lock(&head->lock);         
691
692                         /* Does not bother with rcv_saddr checks,
693                          * because the established check is already
694                          * unique enough.
695                          */
696                         for (tb = head->chain; tb; tb = tb->next) {
697                                 if (tb->port == rover) {
698                                         BUG_TRAP(tb->owners != NULL);
699                                         if (tb->fastreuse >= 0)
700                                                 goto next_port;
701                                         if (!__tcp_v4_check_established(sk, rover, &tw))
702                                                 goto ok;
703                                         goto next_port;
704                                 }
705                         }
706
707                         tb = tcp_bucket_create(head, rover);
708                         if (!tb) {
709                                 spin_unlock(&head->lock);
710                                 break;
711                         }
712                         tb->fastreuse = -1;
713                         goto ok;
714
715                 next_port:
716                         spin_unlock(&head->lock);
717                 } while (--remaining > 0);
718                 tcp_port_rover = rover;
719                 spin_unlock(&tcp_portalloc_lock);
720
721                 local_bh_enable();
722
723                 return -EADDRNOTAVAIL;
724
725         ok:
726                 /* All locks still held and bhs disabled */
727                 tcp_port_rover = rover;
728                 spin_unlock(&tcp_portalloc_lock);
729
730                 tcp_bind_hash(sk, tb, rover);
731                 if (!sk->pprev) {
732                         sk->sport = htons(rover);
733                         __tcp_v4_hash(sk, 0);
734                 }
735                 spin_unlock(&head->lock);
736
737                 if (tw) {
738                         tcp_tw_deschedule(tw);
739                         tcp_timewait_kill(tw);
740                         tcp_tw_put(tw);
741                 }
742
743                 local_bh_enable();
744                 return 0;
745         }
746
747         head  = &tcp_bhash[tcp_bhashfn(snum)];
748         tb  = (struct tcp_bind_bucket *)sk->prev;
749         spin_lock_bh(&head->lock);
750         if (tb->owners == sk && sk->bind_next == NULL) {
751                 __tcp_v4_hash(sk, 0);
752                 spin_unlock_bh(&head->lock);
753                 return 0;
754         } else {
755                 int ret;
756                 spin_unlock(&head->lock);
757                 /* No definite answer... Walk the established hash table. */
758                 ret = __tcp_v4_check_established(sk, snum, NULL);
759                 local_bh_enable();
760                 return ret;
761         }
762 }
763
764 /* This will initiate an outgoing connection. */
765 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
766 {
767         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
768         struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
769         struct rtable *rt;
770         u32 daddr, nexthop;
771         int tmp;
772         int err;
773
774         if (addr_len < sizeof(struct sockaddr_in))
775                 return(-EINVAL);
776
777         if (usin->sin_family != AF_INET)
778                 return(-EAFNOSUPPORT);
779
780         nexthop = daddr = usin->sin_addr.s_addr;
781         if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
782                 if (daddr == 0)
783                         return -EINVAL;
784                 nexthop = sk->protinfo.af_inet.opt->faddr;
785         }
786
787         tmp = ip_route_connect(&rt, nexthop, sk->saddr,
788                                RT_CONN_FLAGS(sk), sk->bound_dev_if);
789         if (tmp < 0)
790                 return tmp;
791
792         if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
793                 ip_rt_put(rt);
794                 return -ENETUNREACH;
795         }
796
797         __sk_dst_set(sk, &rt->u.dst);
798         sk->route_caps = rt->u.dst.dev->features;
799
800         if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
801                 daddr = rt->rt_dst;
802
803         if (!sk->saddr)
804                 sk->saddr = rt->rt_src;
805         sk->rcv_saddr = sk->saddr;
806
807         if (tp->ts_recent_stamp && sk->daddr != daddr) {
808                 /* Reset inherited state */
809                 tp->ts_recent = 0;
810                 tp->ts_recent_stamp = 0;
811                 tp->write_seq = 0;
812         }
813
814         if (sysctl_tcp_tw_recycle &&
815             !tp->ts_recent_stamp &&
816             rt->rt_dst == daddr) {
817                 struct inet_peer *peer = rt_get_peer(rt);
818
819                 /* VJ's idea. We save the last timestamp seen from
820                  * the destination in the peer table when entering TIME-WAIT state,
821                  * and initialize ts_recent from it when trying a new connection.
822                  */
823
824                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
825                         tp->ts_recent_stamp = peer->tcp_ts_stamp;
826                         tp->ts_recent = peer->tcp_ts;
827                 }
828         }
829
830         sk->dport = usin->sin_port;
831         sk->daddr = daddr;
832
833         tp->ext_header_len = 0;
834         if (sk->protinfo.af_inet.opt)
835                 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
836
837         tp->mss_clamp = 536;
838
839         /* Socket identity is still unknown (sport may be zero).
840          * However we set the state to SYN-SENT and, without releasing the
841          * socket lock, select a source port, enter ourselves into the hash
842          * tables and complete initialization after this.
843          */
844         tcp_set_state(sk, TCP_SYN_SENT);
845         err = tcp_v4_hash_connect(sk);
846         if (err)
847                 goto failure;
848
849         if (!tp->write_seq)
850                 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
851                                                            sk->sport, usin->sin_port);
852
853         sk->protinfo.af_inet.id = tp->write_seq^jiffies;
854
855         err = tcp_connect(sk);
856         if (err)
857                 goto failure;
858
859         return 0;
860
861 failure:
862         tcp_set_state(sk, TCP_CLOSE);
863         __sk_dst_reset(sk);
864         sk->route_caps = 0;
865         sk->dport = 0;
866         return err;
867 }
868
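/* Index of the interface the segment arrived on, taken from its route. */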
869 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
870 {
871         return ((struct rtable*)skb->dst)->rt_iif;
872 }
873
874 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
875 {
876         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
877 }
878
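/* Look up a pending open_request in the listener's SYN table by remote
 * port and address pair; *prevp is set so the caller can unlink it later.
 */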
879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, 
880                                               struct open_request ***prevp,
881                                               __u16 rport,
882                                               __u32 raddr, __u32 laddr)
883 {
884         struct tcp_listen_opt *lopt = tp->listen_opt;
885         struct open_request *req, **prev;  
886
887         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
888              (req = *prev) != NULL;
889              prev = &req->dl_next) {
890                 if (req->rmt_port == rport &&
891                     req->af.v4_req.rmt_addr == raddr &&
892                     req->af.v4_req.loc_addr == laddr &&
893                     TCP_INET_FAMILY(req->class->family)) {
894                         BUG_TRAP(req->sk == NULL);
895                         *prevp = prev;
896                         return req; 
897                 }
898         }
899
900         return NULL;
901 }
902
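/* Queue a fresh open_request into the listener's SYN table under
 * syn_wait_lock and update the SYN queue accounting via tcp_synq_added().
 */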
903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
904 {
905         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
906         struct tcp_listen_opt *lopt = tp->listen_opt;
907         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
908
909         req->expires = jiffies + TCP_TIMEOUT_INIT;
910         req->retrans = 0;
911         req->sk = NULL;
912         req->dl_next = lopt->syn_table[h];
913
914         write_lock(&tp->syn_wait_lock);
915         lopt->syn_table[h] = req;
916         write_unlock(&tp->syn_wait_lock);
917
918         tcp_synq_added(sk);
919 }
920
921
922 /* 
923  * This routine does path mtu discovery as defined in RFC1191.
924  */
925 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
926 {
927         struct dst_entry *dst;
928         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
929
930         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
931          * sent out by Linux are always <576 bytes so they should go through
932          * unfragmented).
933          */
934         if (sk->state == TCP_LISTEN)
935                 return; 
936
937         /* We don't check in the dst entry if pmtu discovery is forbidden
938          * on this route. We just assume that no packet-too-big packets
939          * are sent back when pmtu discovery is not active.
940          * There is a small race when the user changes this flag in the
941          * route, but I think that's acceptable.
942          */
943         if ((dst = __sk_dst_check(sk, 0)) == NULL)
944                 return;
945
946         ip_rt_update_pmtu(dst, mtu);
947
948         /* Something is about to go wrong... Remember the soft error
949          * in case this connection is not able to recover.
950          */
951         if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
952                 sk->err_soft = EMSGSIZE;
953
954         if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
955             tp->pmtu_cookie > dst->pmtu) {
956                 tcp_sync_mss(sk, dst->pmtu);
957
958                 /* Resend the TCP packet because it's  
959                  * clear that the old packet has been
960                  * dropped. This is the new "fast" path mtu
961                  * discovery.
962                  */
963                 tcp_simple_retransmit(sk);
964         } /* else let the usual retransmit timer handle it */
965 }
966
967 /*
968  * This routine is called by the ICMP module when it gets some
969  * sort of error condition.  If err < 0 then the socket should
970  * be closed and the error returned to the user.  If err > 0
971  * it's just the icmp type << 8 | icmp code.  After adjustment
972  * header points to the first 8 bytes of the tcp header.  We need
973  * to find the appropriate port.
974  *
975  * The locking strategy used here is very "optimistic". When
976  * someone else accesses the socket the ICMP is just dropped
977  * and for some paths there is no check at all.
978  * A more general error queue to queue errors for later handling
979  * is probably better.
980  *
981  */
982
983 void tcp_v4_err(struct sk_buff *skb, u32 info)
984 {
985         struct iphdr *iph = (struct iphdr*)skb->data;
986         struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
987         struct tcp_opt *tp;
988         int type = skb->h.icmph->type;
989         int code = skb->h.icmph->code;
990         struct sock *sk;
991         __u32 seq;
992         int err;
993
994         if (skb->len < (iph->ihl << 2) + 8) {
995                 ICMP_INC_STATS_BH(IcmpInErrors); 
996                 return;
997         }
998
999         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
1000         if (sk == NULL) {
1001                 ICMP_INC_STATS_BH(IcmpInErrors);
1002                 return;
1003         }
1004         if (sk->state == TCP_TIME_WAIT) {
1005                 tcp_tw_put((struct tcp_tw_bucket*)sk);
1006                 return;
1007         }
1008
1009         bh_lock_sock(sk);
1010         /* If too many ICMPs get dropped on busy
1011          * servers this needs to be solved differently.
1012          */
1013         if (sk->lock.users != 0)
1014                 NET_INC_STATS_BH(LockDroppedIcmps);
1015
1016         if (sk->state == TCP_CLOSE)
1017                 goto out;
1018
1019         tp = &sk->tp_pinfo.af_tcp;
1020         seq = ntohl(th->seq);
1021         if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1022                 NET_INC_STATS(OutOfWindowIcmps);
1023                 goto out;
1024         }
1025
1026         switch (type) {
1027         case ICMP_SOURCE_QUENCH:
1028                 /* Just silently ignore these. */
1029                 goto out;
1030         case ICMP_PARAMETERPROB:
1031                 err = EPROTO;
1032                 break; 
1033         case ICMP_DEST_UNREACH:
1034                 if (code > NR_ICMP_UNREACH)
1035                         goto out;
1036
1037                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1038                         if (sk->lock.users == 0)
1039                                 do_pmtu_discovery(sk, iph, info);
1040                         goto out;
1041                 }
1042
1043                 err = icmp_err_convert[code].errno;
1044                 break;
1045         case ICMP_TIME_EXCEEDED:
1046                 err = EHOSTUNREACH;
1047                 break;
1048         default:
1049                 goto out;
1050         }
1051
1052         switch (sk->state) {
1053                 struct open_request *req, **prev;
1054         case TCP_LISTEN:
1055                 if (sk->lock.users != 0)
1056                         goto out;
1057
1058                 req = tcp_v4_search_req(tp, &prev,
1059                                         th->dest,
1060                                         iph->daddr, iph->saddr); 
1061                 if (!req)
1062                         goto out;
1063
1064                 /* ICMPs are not backlogged, hence we cannot get
1065                    an established socket here.
1066                  */
1067                 BUG_TRAP(req->sk == NULL);
1068
1069                 if (seq != req->snt_isn) {
1070                         NET_INC_STATS_BH(OutOfWindowIcmps);
1071                         goto out;
1072                 }
1073
1074                 /* 
1075                  * Still in SYN_RECV, just remove it silently.
1076                  * There is no good way to pass the error to the newly
1077                  * created socket, and POSIX does not want network
1078                  * errors returned from accept(). 
1079                  */ 
1080                 tcp_synq_drop(sk, req, prev);
1081                 goto out;
1082
1083         case TCP_SYN_SENT:
1084         case TCP_SYN_RECV:  /* Cannot happen?
1085                                It can, for example, if SYNs crossed.
1086                              */ 
1087                 if (sk->lock.users == 0) {
1088                         TCP_INC_STATS_BH(TcpAttemptFails);
1089                         sk->err = err;
1090
1091                         sk->error_report(sk);
1092
1093                         tcp_done(sk);
1094                 } else {
1095                         sk->err_soft = err;
1096                 }
1097                 goto out;
1098         }
1099
1100         /* If we've already connected we will keep trying
1101          * until we time out, or the user gives up.
1102          *
1103          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1104          * to be considered hard errors (well, FRAG_FAILED too,
1105          * but it is obsoleted by pmtu discovery).
1106          *
1107          * Note that in the modern internet, where routing is unreliable
1108          * and broken firewalls sit in every dark corner sending random
1109          * errors ordered by their masters, even these two messages finally
1110          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
1111          *
1112          * Now we are in compliance with RFCs.
1113          *                                                      --ANK (980905)
1114          */
1115
1116         if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1117                 sk->err = err;
1118                 sk->error_report(sk);
1119         } else  { /* Only an error on timeout */
1120                 sk->err_soft = err;
1121         }
1122
1123 out:
1124         bh_unlock_sock(sk);
1125         sock_put(sk);
1126 }
1127
1128 /* This routine computes an IPv4 TCP checksum. */
1129 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 
1130                        struct sk_buff *skb)
1131 {
1132         if (skb->ip_summed == CHECKSUM_HW) {
1133                 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1134                 skb->csum = offsetof(struct tcphdr, check);
1135         } else {
1136                 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1137                                          csum_partial((char *)th, th->doff<<2, skb->csum));
1138         }
1139 }
1140
1141 /*
1142  *      This routine will send an RST to the other tcp.
1143  *
1144  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1145  *                    for the reset?
1146  *      Answer: if a packet caused the RST, it is not for a socket
1147  *              existing in our system; if it is matched to a socket,
1148  *              it is just a duplicate segment or a bug in the other side's TCP.
1149  *              So we build the reply based only on the parameters that
1150  *              arrived with the segment.
1151  *      Exception: precedence violation. We do not implement it in any case.
1152  */
1153
1154 static void tcp_v4_send_reset(struct sk_buff *skb)
1155 {
1156         struct tcphdr *th = skb->h.th;
1157         struct tcphdr rth;
1158         struct ip_reply_arg arg;
1159
1160         /* Never send a reset in response to a reset. */
1161         if (th->rst)
1162                 return;
1163
1164         if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1165                 return;
1166
1167         /* Swap the send and the receive. */
1168         memset(&rth, 0, sizeof(struct tcphdr)); 
1169         rth.dest = th->source;
1170         rth.source = th->dest; 
1171         rth.doff = sizeof(struct tcphdr)/4;
1172         rth.rst = 1;
1173
1174         if (th->ack) {
1175                 rth.seq = th->ack_seq;
1176         } else {
1177                 rth.ack = 1;
1178                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1179                                     + skb->len - (th->doff<<2));
1180         }
1181
1182         memset(&arg, 0, sizeof arg); 
1183         arg.iov[0].iov_base = (unsigned char *)&rth; 
1184         arg.iov[0].iov_len  = sizeof rth;
1185         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 
1186                                       skb->nh.iph->saddr, /*XXX*/
1187                                       sizeof(struct tcphdr),
1188                                       IPPROTO_TCP,
1189                                       0); 
1190         arg.n_iov = 1;
1191         arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
1192
1193         tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
1194         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1195
1196         TCP_INC_STATS_BH(TcpOutSegs);
1197         TCP_INC_STATS_BH(TcpOutRsts);
1198 }
1199
1200 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1201    outside of socket context, is certainly ugly. What can I do?
1202  */
1203
1204 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1205 {
1206         struct tcphdr *th = skb->h.th;
1207         struct {
1208                 struct tcphdr th;
1209                 u32 tsopt[3];
1210         } rep;
1211         struct ip_reply_arg arg;
1212
1213         memset(&rep.th, 0, sizeof(struct tcphdr));
1214         memset(&arg, 0, sizeof arg);
1215
1216         arg.iov[0].iov_base = (unsigned char *)&rep; 
1217         arg.iov[0].iov_len  = sizeof(rep.th);
1218         arg.n_iov = 1;
1219         if (ts) {
1220                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
1221                                      (TCPOPT_NOP << 16) |
1222                                      (TCPOPT_TIMESTAMP << 8) |
1223                                      TCPOLEN_TIMESTAMP);
1224                 rep.tsopt[1] = htonl(tcp_time_stamp);
1225                 rep.tsopt[2] = htonl(ts);
1226                 arg.iov[0].iov_len = sizeof(rep);
1227         }
1228
1229         /* Swap the send and the receive. */
1230         rep.th.dest = th->source;
1231         rep.th.source = th->dest; 
1232         rep.th.doff = arg.iov[0].iov_len/4;
1233         rep.th.seq = htonl(seq);
1234         rep.th.ack_seq = htonl(ack);
1235         rep.th.ack = 1;
1236         rep.th.window = htons(win);
1237
1238         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 
1239                                       skb->nh.iph->saddr, /*XXX*/
1240                                       arg.iov[0].iov_len,
1241                                       IPPROTO_TCP,
1242                                       0);
1243         arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
1244
1245         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1246
1247         TCP_INC_STATS_BH(TcpOutSegs);
1248 }
1249
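/* ACK a segment that hit a TIME-WAIT socket, using the state kept in the
 * tw bucket, then drop the reference taken by the lookup.
 */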
1250 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1251 {
1252         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1253
1254         tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1255                         tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1256
1257         tcp_tw_put(tw);
1258 }
1259
1260 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1261 {
1262         tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1263                         req->ts_recent);
1264 }
1265
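/* Route the reply for an open_request, honouring any source route saved
 * in the request's IP options; strict source routes that do not go
 * through the chosen gateway are rejected.
 */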
1266 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1267 {
1268         struct rtable *rt;
1269         struct ip_options *opt;
1270
1271         opt = req->af.v4_req.opt;
1272         if(ip_route_output(&rt, ((opt && opt->srr) ?
1273                                  opt->faddr :
1274                                  req->af.v4_req.rmt_addr),
1275                            req->af.v4_req.loc_addr,
1276                            RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
1277                 IP_INC_STATS_BH(IpOutNoRoutes);
1278                 return NULL;
1279         }
1280         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1281                 ip_rt_put(rt);
1282                 IP_INC_STATS_BH(IpOutNoRoutes);
1283                 return NULL;
1284         }
1285         return &rt->u.dst;
1286 }
1287
1288 /*
1289  *      Send a SYN-ACK after having received a SYN.
1290  *      This still operates on an open_request only, not on a big
1291  *      socket.
1292  */ 
1293 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1294                               struct dst_entry *dst)
1295 {
1296         int err = -1;
1297         struct sk_buff * skb;
1298
1299         /* First, grab a route. */
1300         if (dst == NULL &&
1301             (dst = tcp_v4_route_req(sk, req)) == NULL)
1302                 goto out;
1303
1304         skb = tcp_make_synack(sk, dst, req);
1305
1306         if (skb) {
1307                 struct tcphdr *th = skb->h.th;
1308
1309                 th->check = tcp_v4_check(th, skb->len,
1310                                          req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1311                                          csum_partial((char *)th, skb->len, skb->csum));
1312
1313                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1314                                             req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1315                 if (err == NET_XMIT_CN)
1316                         err = 0;
1317         }
1318
1319 out:
1320         dst_release(dst);
1321         return err;
1322 }
1323
1324 /*
1325  *      IPv4 open_request destructor.
1326  */ 
1327 static void tcp_v4_or_free(struct open_request *req)
1328 {
1329         if (req->af.v4_req.opt)
1330                 kfree(req->af.v4_req.opt);
1331 }
1332
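/* Rate-limited (at most once a minute) warning that SYN cookies have
 * kicked in on this port.
 */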
1333 static inline void syn_flood_warning(struct sk_buff *skb)
1334 {
1335         static unsigned long warntime;
1336         
1337         if (jiffies - warntime > HZ*60) {
1338                 warntime = jiffies;
1339                 printk(KERN_INFO 
1340                        "possible SYN flooding on port %d. Sending cookies.\n",  
1341                        ntohs(skb->h.th->dest));
1342         }
1343 }
1344
1345 /* 
1346  * Save and compile IPv4 options into the open_request if needed. 
1347  */
1348 static inline struct ip_options * 
1349 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1350 {
1351         struct ip_options *opt = &(IPCB(skb)->opt);
1352         struct ip_options *dopt = NULL; 
1353
1354         if (opt && opt->optlen) {
1355                 int opt_size = optlength(opt); 
1356                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1357                 if (dopt) {
1358                         if (ip_options_echo(dopt, skb)) {
1359                                 kfree(dopt);
1360                                 dopt = NULL;
1361                         }
1362                 }
1363         }
1364         return dopt;
1365 }
1366
1367 /* 
1368  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1369  * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1370  * It would be better to replace it with a global counter for all sockets
1371  * but then some measure against one socket starving all other sockets
1372  * would be needed.
1373  *
1374  * It was 128 by default. Experiments with real servers show that
1375  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1376  * of the problems. This value is adjusted to 128 for very small machines
1377  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1378  * Increasing it further requires changing the hash table size.
1379  */
1380 int sysctl_max_syn_backlog = 256; 
1381
1382 struct or_calltable or_ipv4 = {
1383         PF_INET,
1384         tcp_v4_send_synack,
1385         tcp_v4_or_send_ack,
1386         tcp_v4_or_free,
1387         tcp_v4_send_reset
1388 };
1389
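/* Handle an incoming SYN on a listening socket: allocate an open_request,
 * parse the options, choose the ISN (or a SYN cookie under synflood) and
 * send the SYN-ACK.
 */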
1390 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1391 {
1392         struct tcp_opt tp;
1393         struct open_request *req;
1394         __u32 saddr = skb->nh.iph->saddr;
1395         __u32 daddr = skb->nh.iph->daddr;
1396         __u32 isn = TCP_SKB_CB(skb)->when;
1397         struct dst_entry *dst = NULL;
1398 #ifdef CONFIG_SYN_COOKIES
1399         int want_cookie = 0;
1400 #else
1401 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1402 #endif
1403
1404         /* Never answer SYNs sent to broadcast or multicast addresses */
1405         if (((struct rtable *)skb->dst)->rt_flags & 
1406             (RTCF_BROADCAST|RTCF_MULTICAST))
1407                 goto drop; 
1408
1409         /* TW buckets are converted to open requests without
1410          * limitations; they conserve resources and the peer is
1411          * evidently a real one.
1412          */
1413         if (tcp_synq_is_full(sk) && !isn) {
1414 #ifdef CONFIG_SYN_COOKIES
1415                 if (sysctl_tcp_syncookies) {
1416                         want_cookie = 1; 
1417                 } else
1418 #endif
1419                 goto drop;
1420         }
1421
1422         /* The accept backlog is full. If we have already queued enough
1423          * warm entries in the syn queue, drop the request. It is better than
1424          * clogging the syn queue with openreqs with exponentially increasing
1425          * timeouts.
1426          */
1427         if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1428                 goto drop;
1429
1430         req = tcp_openreq_alloc();
1431         if (req == NULL)
1432                 goto drop;
1433
1434         tcp_clear_options(&tp);
1435         tp.mss_clamp = 536;
1436         tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1437
1438         tcp_parse_options(skb, &tp, 0);
1439
1440         if (want_cookie) {
1441                 tcp_clear_options(&tp);
1442                 tp.saw_tstamp = 0;
1443         }
1444
1445         if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1446                 /* Some OSes (unknown ones, but I see them on web servers which
1447                  * contain information interesting only for windows
1448                  * users) do not send their stamp in the SYN. It is an easy case:
1449                  * we simply do not advertise TS support.
1450                  */
1451                 tp.saw_tstamp = 0;
1452                 tp.tstamp_ok = 0;
1453         }
1454         tp.tstamp_ok = tp.saw_tstamp;
1455
1456         tcp_openreq_init(req, &tp, skb);
1457
1458         req->af.v4_req.loc_addr = daddr;
1459         req->af.v4_req.rmt_addr = saddr;
1460         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1461         req->class = &or_ipv4;
1462         if (!want_cookie)
1463                 TCP_ECN_create_request(req, skb->h.th);
1464
1465         if (want_cookie) {
1466 #ifdef CONFIG_SYN_COOKIES
1467                 syn_flood_warning(skb);
1468 #endif
1469                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1470         } else if (isn == 0) {
1471                 struct inet_peer *peer = NULL;
1472
1473                 /* VJ's idea. We save the last timestamp seen
1474                  * from the destination in the peer table when entering
1475                  * TIME-WAIT state, and check against it before
1476                  * accepting a new connection request.
1477                  *
1478                  * If "isn" is not zero, this request hit an alive
1479                  * timewait bucket, so all the necessary checks
1480                  * are made in the function processing the timewait state.
1481                  */
1482                 if (tp.saw_tstamp &&
1483                     sysctl_tcp_tw_recycle &&
1484                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1485                     (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1486                     peer->v4daddr == saddr) {
1487                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1488                             (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1489                                 NET_INC_STATS_BH(PAWSPassiveRejected);
1490                                 dst_release(dst);
1491                                 goto drop_and_free;
1492                         }
1493                 }
1494                 /* Kill the following clause if you dislike this approach. */
1495                 else if (!sysctl_tcp_syncookies &&
1496                          (sysctl_max_syn_backlog - tcp_synq_len(sk)
1497                           < (sysctl_max_syn_backlog>>2)) &&
1498                          (!peer || !peer->tcp_ts_stamp) &&
1499                          (!dst || !dst->rtt)) {
1500                         /* Without syncookies, the last quarter of the
1501                          * backlog is reserved for destinations proven to be alive.
1502                          * This means we keep communicating with
1503                          * destinations that were already remembered
1504                          * by the time the SYN flood started.
1505                          */
1506                         NETDEBUG(if (net_ratelimit()) \
1507                                 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1508                                         NIPQUAD(saddr), ntohs(skb->h.th->source)));
1509                         dst_release(dst);
1510                         goto drop_and_free;
1511                 }
1512
1513                 isn = tcp_v4_init_sequence(sk, skb);
1514         }
1515         req->snt_isn = isn;
1516
1517         if (tcp_v4_send_synack(sk, req, dst))
1518                 goto drop_and_free;
1519
1520         if (want_cookie) {
1521                 tcp_openreq_free(req); 
1522         } else {
1523                 tcp_v4_synq_add(sk, req);
1524         }
1525         return 0;
1526
1527 drop_and_free:
1528         tcp_openreq_free(req); 
1529 drop:
1530         TCP_INC_STATS_BH(TcpAttemptFails);
1531         return 0;
1532 }
1533
1534
1535 /* 
1536  * The three-way handshake has completed - we received a valid ACK
1537  * for our SYN-ACK - now create the new socket.
1538  */
1539 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1540                                    struct open_request *req,
1541                                    struct dst_entry *dst)
1542 {
1543         struct tcp_opt *newtp;
1544         struct sock *newsk;
1545
1546         if (tcp_acceptq_is_full(sk))
1547                 goto exit_overflow;
1548
1549         if (dst == NULL &&
1550             (dst = tcp_v4_route_req(sk, req)) == NULL)
1551                 goto exit;
1552
1553         newsk = tcp_create_openreq_child(sk, req, skb);
1554         if (!newsk)
1555                 goto exit;
1556
1557         newsk->dst_cache = dst;
1558         newsk->route_caps = dst->dev->features;
1559
1560         newtp = &(newsk->tp_pinfo.af_tcp);
1561         newsk->daddr = req->af.v4_req.rmt_addr;
1562         newsk->saddr = req->af.v4_req.loc_addr;
1563         newsk->rcv_saddr = req->af.v4_req.loc_addr;
1564         newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1565         req->af.v4_req.opt = NULL;
1566         newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1567         newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1568         newtp->ext_header_len = 0;
1569         if (newsk->protinfo.af_inet.opt)
1570                 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1571         newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1572
1573         tcp_sync_mss(newsk, dst->pmtu);
1574         newtp->advmss = dst->advmss;
1575         tcp_initialize_rcv_mss(newsk);
1576
1577         __tcp_v4_hash(newsk, 0);
1578         __tcp_inherit_port(sk, newsk);
1579
1580         return newsk;
1581
1582 exit_overflow:
1583         NET_INC_STATS_BH(ListenOverflows);
1584 exit:
1585         NET_INC_STATS_BH(ListenDrops);
1586         dst_release(dst);
1587         return NULL;
1588 }
1589
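/* For a segment arriving on a listening socket, look for a matching
 * open_request in the SYN queue or an already established (or TIME-WAIT)
 * socket; with syncookies enabled, a bare ACK may also be validated here.
 */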
1590 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1591 {
1592         struct open_request *req, **prev;
1593         struct tcphdr *th = skb->h.th;
1594         struct iphdr *iph = skb->nh.iph;
1595         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1596         struct sock *nsk;
1597
1598         /* Find possible connection requests. */
1599         req = tcp_v4_search_req(tp, &prev,
1600                                 th->source,
1601                                 iph->saddr, iph->daddr);
1602         if (req)
1603                 return tcp_check_req(sk, skb, req, prev);
1604
1605         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1606                                           th->source,
1607                                           skb->nh.iph->daddr,
1608                                           ntohs(th->dest),
1609                                           tcp_v4_iif(skb));
1610
1611         if (nsk) {
1612                 if (nsk->state != TCP_TIME_WAIT) {
1613                         bh_lock_sock(nsk);
1614                         return nsk;
1615                 }
1616                 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1617                 return NULL;
1618         }
1619
1620 #ifdef CONFIG_SYN_COOKIES
1621         if (!th->rst && !th->syn && th->ack)
1622                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1623 #endif
1624         return sk;
1625 }
1626
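/* Initialize checksum state for an incoming segment: verify hardware
 * checksums, check short packets immediately, and store the pseudo-header
 * sum for longer ones so the check can be completed later.
 */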
1627 static int tcp_v4_checksum_init(struct sk_buff *skb)
1628 {
1629         if (skb->ip_summed == CHECKSUM_HW) {
1630                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1631                 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1632                                   skb->nh.iph->daddr,skb->csum))
1633                         return 0;
1634
1635                 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1636                 skb->ip_summed = CHECKSUM_NONE;
1637         }
1638         if (skb->len <= 76) {
1639                 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1640                                  skb->nh.iph->daddr,
1641                                  skb_checksum(skb, 0, skb->len, 0)))
1642                         return -1;
1643                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1644         } else {
1645                 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1646                                           skb->nh.iph->daddr,0);
1647         }
1648         return 0;
1649 }
1650
1651
1652 /* The socket must have its spinlock held when we get
1653  * here.
1654  *
1655  * We have a potential double-lock case here, so even when
1656  * doing backlog processing we use the BH locking scheme.
1657  * This is because we cannot sleep with the original spinlock
1658  * held.
1659  */
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1661 {
1662         IP_INC_STATS_BH(IpInDelivers);
1663
1664         if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1665                 TCP_CHECK_TIMER(sk);
1666                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1667                         goto reset;
1668                 TCP_CHECK_TIMER(sk);
1669                 return 0; 
1670         }
1671
1672         if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1673                 goto csum_err;
1674
1675         if (sk->state == TCP_LISTEN) { 
1676                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1677                 if (!nsk)
1678                         goto discard;
1679
1680                 if (nsk != sk) {
1681                         if (tcp_child_process(sk, nsk, skb))
1682                                 goto reset;
1683                         return 0;
1684                 }
1685         }
1686
1687         TCP_CHECK_TIMER(sk);
1688         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1689                 goto reset;
1690         TCP_CHECK_TIMER(sk);
1691         return 0;
1692
1693 reset:
1694         tcp_v4_send_reset(skb);
1695 discard:
1696         kfree_skb(skb);
1697         /* Be careful here. If this function gets more complicated and
1698          * gcc suffers from register pressure on the x86, sk (in %ebx) 
1699          * might be destroyed here. This current version compiles correctly,
1700          * but you have been warned.
1701          */
1702         return 0;
1703
1704 csum_err:
1705         TCP_INC_STATS_BH(TcpInErrs);
1706         goto discard;
1707 }
1708
1709 /*
1710  *      From tcp_input.c
1711  */
1712
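/* Main IPv4 receive path: validate the header and checksum, fill in the
 * TCP control block, look up the owning socket and hand the segment to
 * the fast path, the prequeue or the socket backlog.
 */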
1713 int tcp_v4_rcv(struct sk_buff *skb)
1714 {
1715         struct tcphdr *th;
1716         struct sock *sk;
1717         int ret;
1718
1719         if (skb->pkt_type!=PACKET_HOST)
1720                 goto discard_it;
1721
1722         /* Count it even if it's bad */
1723         TCP_INC_STATS_BH(TcpInSegs);
1724
1725         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1726                 goto discard_it;
1727
1728         th = skb->h.th;
1729
1730         if (th->doff < sizeof(struct tcphdr)/4)
1731                 goto bad_packet;
1732         if (!pskb_may_pull(skb, th->doff*4))
1733                 goto discard_it;
1734
1735         /* An explanation is required here, I think.
1736          * Packet length and doff are validated by header prediction,
1737          * provided the case of th->doff == 0 is eliminated.
1738          * So, we defer the checks. */
1739         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1740              tcp_v4_checksum_init(skb) < 0))
1741                 goto bad_packet;
1742
1743         th = skb->h.th;
1744         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1745         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1746                                     skb->len - th->doff*4);
1747         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1748         TCP_SKB_CB(skb)->when = 0;
1749         TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1750         TCP_SKB_CB(skb)->sacked = 0;
1751
1752         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1753                              skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1754
1755         if (!sk)
1756                 goto no_tcp_socket;
1757
1758 process:
1759         if(!ipsec_sk_policy(sk,skb))
1760                 goto discard_and_relse;
1761
1762         if (sk->state == TCP_TIME_WAIT)
1763                 goto do_time_wait;
1764
1765         if (sk_filter(sk, skb, 0))
1766                 goto discard_and_relse;
1767
1768         skb->dev = NULL;
1769
1770         bh_lock_sock(sk);
1771         ret = 0;
1772         if (!sk->lock.users) {
1773                 if (!tcp_prequeue(sk, skb))
1774                         ret = tcp_v4_do_rcv(sk, skb);
1775         } else
1776                 sk_add_backlog(sk, skb);
1777         bh_unlock_sock(sk);
1778
1779         sock_put(sk);
1780
1781         return ret;
1782
1783 no_tcp_socket:
1784         if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1785 bad_packet:
1786                 TCP_INC_STATS_BH(TcpInErrs);
1787         } else {
1788                 tcp_v4_send_reset(skb);
1789         }
1790
1791 discard_it:
1792         /* Discard frame. */
1793         kfree_skb(skb);
1794         return 0;
1795
1796 discard_and_relse:
1797         sock_put(sk);
1798         goto discard_it;
1799
1800 do_time_wait:
1801         if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1802                 TCP_INC_STATS_BH(TcpInErrs);
1803                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1804                 goto discard_it;
1805         }
1806         switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1807                                           skb, th, skb->len)) {
1808         case TCP_TW_SYN:
1809         {
1810                 struct sock *sk2;
1811
1812                 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1813                 if (sk2 != NULL) {
1814                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1815                         tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1816                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1817                         sk = sk2;
1818                         goto process;
1819                 }
1820                 /* Fall through to ACK */
1821         }
1822         case TCP_TW_ACK:
1823                 tcp_v4_timewait_ack(sk, skb);
1824                 break;
1825         case TCP_TW_RST:
1826                 goto no_tcp_socket;
1827         case TCP_TW_SUCCESS:;
1828         }
1829         goto discard_it;
1830 }
1831
1832 /* With per-bucket locks this operation is not atomic, so
1833  * this version is no worse.
1834  */
1835 static void __tcp_v4_rehash(struct sock *sk)
1836 {
1837         sk->prot->unhash(sk);
1838         sk->prot->hash(sk);
1839 }
1840
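/* Called when rerouting with the current source address failed: query a
 * new route, adopt its preferred source address and rehash the socket.
 */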
1841 static int tcp_v4_reselect_saddr(struct sock *sk)
1842 {
1843         int err;
1844         struct rtable *rt;
1845         __u32 old_saddr = sk->saddr;
1846         __u32 new_saddr;
1847         __u32 daddr = sk->daddr;
1848
1849         if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1850                 daddr = sk->protinfo.af_inet.opt->faddr;
1851
1852         /* Query new route. */
1853         err = ip_route_connect(&rt, daddr, 0,
1854                                RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1855                                sk->bound_dev_if);
1856         if (err)
1857                 return err;
1858
1859         __sk_dst_set(sk, &rt->u.dst);
1860         sk->route_caps = rt->u.dst.dev->features;
1861
1862         new_saddr = rt->rt_src;
1863
1864         if (new_saddr == old_saddr)
1865                 return 0;
1866
1867         if (sysctl_ip_dynaddr > 1) {
1868                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1869                        "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1870                        NIPQUAD(old_saddr), 
1871                        NIPQUAD(new_saddr));
1872         }
1873
1874         sk->saddr = new_saddr;
1875         sk->rcv_saddr = new_saddr;
1876
1877         /* XXX The only ugly spot where we need to
1878          * XXX really change the socket's identity after
1879          * XXX it has entered the hashes. -DaveM
1880          *
1881          * Besides that, it does not check for connection
1882          * uniqueness. Expect trouble.
1883          */
1884         __tcp_v4_rehash(sk);
1885         return 0;
1886 }
1887
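/* Make sure the socket has a valid route before transmitting: reuse the
 * cached route if it is still good, otherwise reroute and, when ip_dynaddr
 * permits it in SYN_SENT, fall back to selecting a new source address.
 */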
1888 int tcp_v4_rebuild_header(struct sock *sk)
1889 {
1890         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1891         u32 daddr;
1892         int err;
1893
1894         /* Route is OK, nothing to do. */
1895         if (rt != NULL)
1896                 return 0;
1897
1898         /* Reroute. */
1899         daddr = sk->daddr;
1900         if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1901                 daddr = sk->protinfo.af_inet.opt->faddr;
1902
1903         err = ip_route_output(&rt, daddr, sk->saddr,
1904                               RT_CONN_FLAGS(sk), sk->bound_dev_if);
1905         if (!err) {
1906                 __sk_dst_set(sk, &rt->u.dst);
1907                 sk->route_caps = rt->u.dst.dev->features;
1908                 return 0;
1909         }
1910
1911         /* Routing failed... */
1912         sk->route_caps = 0;
1913
1914         if (!sysctl_ip_dynaddr ||
1915             sk->state != TCP_SYN_SENT ||
1916             (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1917             (err = tcp_v4_reselect_saddr(sk)) != 0)
1918                 sk->err_soft=-err;
1919
1920         return err;
1921 }
1922
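/* Fill in a sockaddr_in with the connected peer's address and port. */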
1923 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1924 {
1925         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1926
1927         sin->sin_family         = AF_INET;
1928         sin->sin_addr.s_addr    = sk->daddr;
1929         sin->sin_port           = sk->dport;
1930 }
1931
1932 /* VJ's idea: save the last timestamp seen from this destination
1933  * and hold it for at least the normal timewait interval, to use for
1934  * duplicate-segment detection in subsequent connections before they
1935  * enter the synchronized state.
1936  */
1937
1938 int tcp_v4_remember_stamp(struct sock *sk)
1939 {
1940         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1941         struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1942         struct inet_peer *peer = NULL;
1943         int release_it = 0;
1944
1945         if (rt == NULL || rt->rt_dst != sk->daddr) {
1946                 peer = inet_getpeer(sk->daddr, 1);
1947                 release_it = 1;
1948         } else {
1949                 if (rt->peer == NULL)
1950                         rt_bind_peer(rt, 1);
1951                 peer = rt->peer;
1952         }
1953
1954         if (peer) {
1955                 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1956                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1957                      peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1958                         peer->tcp_ts_stamp = tp->ts_recent_stamp;
1959                         peer->tcp_ts = tp->ts_recent;
1960                 }
1961                 if (release_it)
1962                         inet_putpeer(peer);
1963                 return 1;
1964         }
1965
1966         return 0;
1967 }
1968
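/* As above, but for a connection that has already been replaced by a
 * TIME-WAIT bucket: record the last timestamp from the timewait state.
 */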
1969 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1970 {
1971         struct inet_peer *peer = NULL;
1972
1973         peer = inet_getpeer(tw->daddr, 1);
1974
1975         if (peer) {
1976                 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1977                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1978                      peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1979                         peer->tcp_ts_stamp = tw->ts_recent_stamp;
1980                         peer->tcp_ts = tw->ts_recent;
1981                 }
1982                 inet_putpeer(peer);
1983                 return 1;
1984         }
1985
1986         return 0;
1987 }
1988
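/* IPv4-specific hooks used by the address-family independent TCP code. */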
1989 struct tcp_func ipv4_specific = {
1990         ip_queue_xmit,
1991         tcp_v4_send_check,
1992         tcp_v4_rebuild_header,
1993         tcp_v4_conn_request,
1994         tcp_v4_syn_recv_sock,
1995         tcp_v4_remember_stamp,
1996         sizeof(struct iphdr),
1997
1998         ip_setsockopt,
1999         ip_getsockopt,
2000         v4_addr2sockaddr,
2001         sizeof(struct sockaddr_in)
2002 };
2003
2004 /* NOTE: Many fields are set to zero explicitly by the call to
2005  *       sk_alloc(), so they need not be initialized here.
2006  */
2007 static int tcp_v4_init_sock(struct sock *sk)
2008 {
2009         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2010
2011         skb_queue_head_init(&tp->out_of_order_queue);
2012         tcp_init_xmit_timers(sk);
2013         tcp_prequeue_init(tp);
2014
2015         tp->rto  = TCP_TIMEOUT_INIT;
2016         tp->mdev = TCP_TIMEOUT_INIT;
2017       
2018         /* So many TCP implementations out there (incorrectly) count the
2019          * initial SYN frame in their delayed-ACK and congestion control
2020          * algorithms that we must have the following bandaid to talk
2021          * efficiently to them.  -DaveM
2022          */
2023         tp->snd_cwnd = 2;
2024
2025         /* See draft-stevens-tcpca-spec-01 for discussion of the
2026          * initialization of these values.
2027          */
2028         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2029         tp->snd_cwnd_clamp = ~0;
2030         tp->mss_cache = 536;
2031
2032         tp->reordering = sysctl_tcp_reordering;
2033
2034         sk->state = TCP_CLOSE;
2035
2036         sk->write_space = tcp_write_space;
2037         sk->use_write_queue = 1;
2038
2039         sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2040
2041         sk->sndbuf = sysctl_tcp_wmem[1];
2042         sk->rcvbuf = sysctl_tcp_rmem[1];
2043
2044         atomic_inc(&tcp_sockets_allocated);
2045
2046         return 0;
2047 }
2048
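/* Final per-socket cleanup: stop the timers, purge the queues, and release
 * the bound port and any cached sendmsg page.
 */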
2049 static int tcp_v4_destroy_sock(struct sock *sk)
2050 {
2051         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2052
2053         tcp_clear_xmit_timers(sk);
2054
2055         /* Clean up the write buffer. */
2056         tcp_writequeue_purge(sk);
2057
2058         /* Clean up our, hopefully empty, out_of_order_queue. */
2059         __skb_queue_purge(&tp->out_of_order_queue);
2060
2061         /* Clean up the prequeue; it really should be empty by now. */
2062         __skb_queue_purge(&tp->ucopy.prequeue);
2063
2064         /* Clean up a referenced TCP bind bucket. */
2065         if(sk->prev != NULL)
2066                 tcp_put_port(sk);
2067
2068         /* If sendmsg cached page exists, toss it. */
2069         if (tp->sndmsg_page != NULL)
2070                 __free_page(tp->sndmsg_page);
2071
2072         atomic_dec(&tcp_sockets_allocated);
2073
2074         return 0;
2075 }
2076
2077 /* Proc filesystem TCP sock list dumping. */
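/* Format one pending open_request (SYN_RECV) as a /proc/net/tcp line. */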
2078 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2079 {
2080         int ttd = req->expires - jiffies;
2081
2082         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2083                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2084                 i,
2085                 req->af.v4_req.loc_addr,
2086                 ntohs(sk->sport),
2087                 req->af.v4_req.rmt_addr,
2088                 ntohs(req->rmt_port),
2089                 TCP_SYN_RECV,
2090                 0,0, /* could print option size, but that is af dependent. */
2091                 1,   /* timers active (only the expire timer) */  
2092                 ttd, 
2093                 req->retrans,
2094                 uid,
2095                 0,  /* non standard timer */  
2096                 0, /* open_requests have no inode */
2097                 atomic_read(&sk->refcnt),
2098                 req
2099                 ); 
2100 }
2101
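/* Format one listening or established socket as a /proc/net/tcp line. */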
2102 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2103 {
2104         unsigned int dest, src;
2105         __u16 destp, srcp;
2106         int timer_active;
2107         unsigned long timer_expires;
2108         struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2109
2110         dest  = sp->daddr;
2111         src   = sp->rcv_saddr;
2112         destp = ntohs(sp->dport);
2113         srcp  = ntohs(sp->sport);
2114         if (tp->pending == TCP_TIME_RETRANS) {
2115                 timer_active    = 1;
2116                 timer_expires   = tp->timeout;
2117         } else if (tp->pending == TCP_TIME_PROBE0) {
2118                 timer_active    = 4;
2119                 timer_expires   = tp->timeout;
2120         } else if (timer_pending(&sp->timer)) {
2121                 timer_active    = 2;
2122                 timer_expires   = sp->timer.expires;
2123         } else {
2124                 timer_active    = 0;
2125                 timer_expires = jiffies;
2126         }
2127
2128         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2129                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2130                 i, src, srcp, dest, destp, sp->state, 
2131                 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2132                 timer_active, timer_expires-jiffies,
2133                 tp->retransmits,
2134                 sock_i_uid(sp),
2135                 tp->probes_out,
2136                 sock_i_ino(sp),
2137                 atomic_read(&sp->refcnt), sp,
2138                 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2139                 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2140                 );
2141 }
2142
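/* Format one TIME-WAIT bucket as a /proc/net/tcp line. */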
2143 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2144 {
2145         unsigned int dest, src;
2146         __u16 destp, srcp;
2147         int ttd = tw->ttd - jiffies;
2148
2149         if (ttd < 0)
2150                 ttd = 0;
2151
2152         dest  = tw->daddr;
2153         src   = tw->rcv_saddr;
2154         destp = ntohs(tw->dport);
2155         srcp  = ntohs(tw->sport);
2156
2157         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2158                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2159                 i, src, srcp, dest, destp, tw->substate, 0, 0,
2160                 3, ttd, 0, 0, 0, 0,
2161                 atomic_read(&tw->refcnt), tw);
2162 }
2163
2164 #define TMPSZ 150
2165
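/* /proc/net/tcp read handler: print the header line, then walk the
 * listening table (including pending open requests), the established hash
 * and its TIME-WAIT half, honouring the requested offset and length.
 */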
2166 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2167 {
2168         int len = 0, num = 0, i;
2169         off_t begin, pos = 0;
2170         char tmpbuf[TMPSZ+1];
2171
2172         if (offset < TMPSZ)
2173                 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2174                                "  sl  local_address rem_address   st tx_queue "
2175                                "rx_queue tr tm->when retrnsmt   uid  timeout inode");
2176
2177         pos = TMPSZ;
2178
2179         /* First, walk listening socket table. */
2180         tcp_listen_lock();
2181         for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2182                 struct sock *sk;
2183                 struct tcp_listen_opt *lopt;
2184                 int k;
2185
2186                 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2187                         struct open_request *req;
2188                         int uid;
2189                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2190
2191                         if (!TCP_INET_FAMILY(sk->family))
2192                                 goto skip_listen;
2193
2194                         pos += TMPSZ;
2195                         if (pos >= offset) {
2196                                 get_tcp_sock(sk, tmpbuf, num);
2197                                 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2198                                 if (pos >= offset + length) {
2199                                         tcp_listen_unlock();
2200                                         goto out_no_bh;
2201                                 }
2202                         }
2203
2204 skip_listen:
2205                         uid = sock_i_uid(sk);
2206                         read_lock_bh(&tp->syn_wait_lock);
2207                         lopt = tp->listen_opt;
2208                         if (lopt && lopt->qlen != 0) {
2209                                 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2210                                         for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2211                                                 if (!TCP_INET_FAMILY(req->class->family))
2212                                                         continue;
2213
2214                                                 pos += TMPSZ;
2215                                                 if (pos <= offset)
2216                                                         continue;
2217                                                 get_openreq(sk, req, tmpbuf, num, uid);
2218                                                 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2219                                                 if (pos >= offset + length) {
2220                                                         read_unlock_bh(&tp->syn_wait_lock);
2221                                                         tcp_listen_unlock();
2222                                                         goto out_no_bh;
2223                                                 }
2224                                         }
2225                                 }
2226                         }
2227                         read_unlock_bh(&tp->syn_wait_lock);
2228
2229                         /* Completed requests are in normal socket hash table */
2230                 }
2231         }
2232         tcp_listen_unlock();
2233
2234         local_bh_disable();
2235
2236         /* Next, walk established hash chain. */
2237         for (i = 0; i < tcp_ehash_size; i++) {
2238                 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2239                 struct sock *sk;
2240                 struct tcp_tw_bucket *tw;
2241
2242                 read_lock(&head->lock);
2243                 for(sk = head->chain; sk; sk = sk->next, num++) {
2244                         if (!TCP_INET_FAMILY(sk->family))
2245                                 continue;
2246                         pos += TMPSZ;
2247                         if (pos <= offset)
2248                                 continue;
2249                         get_tcp_sock(sk, tmpbuf, num);
2250                         len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2251                         if (pos >= offset + length) {
2252                                 read_unlock(&head->lock);
2253                                 goto out;
2254                         }
2255                 }
2256                 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2257                      tw != NULL;
2258                      tw = (struct tcp_tw_bucket *)tw->next, num++) {
2259                         if (!TCP_INET_FAMILY(tw->family))
2260                                 continue;
2261                         pos += TMPSZ;
2262                         if (pos <= offset)
2263                                 continue;
2264                         get_timewait_sock(tw, tmpbuf, num);
2265                         len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2266                         if (pos >= offset + length) {
2267                                 read_unlock(&head->lock);
2268                                 goto out;
2269                         }
2270                 }
2271                 read_unlock(&head->lock);
2272         }
2273
2274 out:
2275         local_bh_enable();
2276 out_no_bh:
2277
2278         begin = len - (pos - offset);
2279         *start = buffer + begin;
2280         len -= begin;
2281         if (len > length)
2282                 len = length;
2283         if (len < 0)
2284                 len = 0; 
2285         return len;
2286 }
2287
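/* Operations exported to the socket layer for TCP over IPv4. */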
2288 struct proto tcp_prot = {
2289         name:           "TCP",
2290         close:          tcp_close,
2291         connect:        tcp_v4_connect,
2292         disconnect:     tcp_disconnect,
2293         accept:         tcp_accept,
2294         ioctl:          tcp_ioctl,
2295         init:           tcp_v4_init_sock,
2296         destroy:        tcp_v4_destroy_sock,
2297         shutdown:       tcp_shutdown,
2298         setsockopt:     tcp_setsockopt,
2299         getsockopt:     tcp_getsockopt,
2300         sendmsg:        tcp_sendmsg,
2301         recvmsg:        tcp_recvmsg,
2302         backlog_rcv:    tcp_v4_do_rcv,
2303         hash:           tcp_v4_hash,
2304         unhash:         tcp_unhash,
2305         get_port:       tcp_v4_get_port,
2306 };
2307
2308
2309
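/* Boot-time initialization of TCP's internal control socket: create it via
 * the protocol family's create() hook and unhash it so that it never
 * receives packets itself.
 */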
2310 void __init tcp_v4_init(struct net_proto_family *ops)
2311 {
2312         int err;
2313
2314         tcp_inode.i_mode = S_IFSOCK;
2315         tcp_inode.i_sock = 1;
2316         tcp_inode.i_uid = 0;
2317         tcp_inode.i_gid = 0;
2318         init_waitqueue_head(&tcp_inode.i_wait);
2319         init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2320
2321         tcp_socket->inode = &tcp_inode;
2322         tcp_socket->state = SS_UNCONNECTED;
2323         tcp_socket->type=SOCK_RAW;
2324
2325         if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2326                 panic("Failed to create the TCP control socket.\n");
2327         tcp_socket->sk->allocation=GFP_ATOMIC;
2328         tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2329
2330         /* Unhash it so that IP input processing does not even
2331          * see it; we do not want this socket to receive
2332          * incoming packets.
2333          */
2334         tcp_socket->sk->prot->unhash(tcp_socket->sk);
2335 }