net/ipv4/tcp_ipv4.c  (linux-2.4.21-pre4)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.1.1.1 2005/04/11 02:51:13 jack Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an ACK bit.
36  *              Andi Kleen :            Implemented fast path mtu discovery.
37  *                                      Fixed many serious bugs in the
38  *                                      open_request handling and moved
39  *                                      most of it into the af independent code.
40  *                                      Added tail drop and some other bugfixes.
41  *                                      Added new listen semantics.
42  *              Mike McLagan    :       Routing by source
43  *      Juan Jose Ciarlante:            ip_dynaddr bits
44  *              Andi Kleen:             various fixes.
45  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #include <linux/config.h>
54
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/init.h>
60
61 #include <net/icmp.h>
62 #include <net/tcp.h>
63 #include <net/ipv6.h>
64 #include <net/inet_common.h>
65
66 #include <linux/inet.h>
67 #include <linux/stddef.h>
68 #include <linux/ipsec.h>
69
70 extern int sysctl_ip_dynaddr;
71 extern int sysctl_ip_default_ttl;
72 int sysctl_tcp_tw_reuse = 0;
73 int sysctl_tcp_low_latency = 0;
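/*
 * Editor's note (illustrative, not part of the original file): both of these
 * flags are exported through sysctl, so on a running system they are toggled
 * from user space, e.g. (assuming the usual proc paths):
 *
 *     echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *     echo 1 > /proc/sys/net/ipv4/tcp_low_latency
 */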
74
75 /* Check TCP sequence numbers in ICMP packets. */
76 #define ICMP_MIN_LENGTH 8
77
78 /* Socket used for sending RSTs */      
79 static struct inode tcp_inode;
80 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
81
82 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 
83                        struct sk_buff *skb);
84
85 /*
86  * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
87  */
88 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
89         __tcp_ehash:          NULL,
90         __tcp_bhash:          NULL,
91         __tcp_bhash_size:     0,
92         __tcp_ehash_size:     0,
93         __tcp_listening_hash: { NULL, },
94         __tcp_lhash_lock:     RW_LOCK_UNLOCKED,
95         __tcp_lhash_users:    ATOMIC_INIT(0),
96         __tcp_lhash_wait:
97           __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
98         __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
99 };
100
101 /*
102  * This array holds the first and last local port number.
103  * For high-usage systems, use sysctl to change this to
104  * 32768-61000
105  */
106 int sysctl_local_port_range[2] = { 1024, 4999 };
107 int tcp_port_rover = (1024 - 1);
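/*
 * Editor's note (illustrative, not part of the original file): this array
 * backs the ip_local_port_range sysctl, so the ephemeral range is normally
 * widened from user space rather than by editing the source, e.g. (assuming
 * the standard sysctl name):
 *
 *     echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */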
108
109 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
110                                  __u32 faddr, __u16 fport)
111 {
112         int h = ((laddr ^ lport) ^ (faddr ^ fport));
113         h ^= h>>16;
114         h ^= h>>8;
115         return h & (tcp_ehash_size - 1);
116 }
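/*
 * Editor's sketch (user-space, not kernel code): the established-hash function
 * above is easy to reproduce outside the kernel to see how four-tuples spread
 * across buckets.  The demo_tcp_hashfn name is purely illustrative; the table
 * size must be a power of two for the final mask to behave as a modulus, which
 * the kernel ensures at boot, and byte order of the inputs is ignored here
 * since only the bit mixing is being shown.
 */
#include <stdio.h>

static unsigned int demo_tcp_hashfn(unsigned int laddr, unsigned short lport,
                                    unsigned int faddr, unsigned short fport,
                                    unsigned int ehash_size)
{
        unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

        h ^= h >> 16;   /* fold the high half into the low half */
        h ^= h >> 8;    /* mix once more before masking         */
        return h & (ehash_size - 1);
}

int main(void)
{
        /* 192.0.2.1:12345 -> 198.51.100.7:80 with a 512-bucket table */
        printf("bucket = %u\n",
               demo_tcp_hashfn(0xc0000201, 12345, 0xc6336407, 80, 512));
        return 0;
}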
117
118 static __inline__ int tcp_sk_hashfn(struct sock *sk)
119 {
120         __u32 laddr = sk->rcv_saddr;
121         __u16 lport = sk->num;
122         __u32 faddr = sk->daddr;
123         __u16 fport = sk->dport;
124
125         return tcp_hashfn(laddr, lport, faddr, fport);
126 }
127
128 /* Allocate and initialize a new TCP local port bind bucket.
129  * The bindhash mutex for snum's hash chain must be held here.
130  */
131 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
132                                           unsigned short snum)
133 {
134         struct tcp_bind_bucket *tb;
135
136         tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
137         if(tb != NULL) {
138                 tb->port = snum;
139                 tb->fastreuse = 0;
140                 tb->owners = NULL;
141                 if((tb->next = head->chain) != NULL)
142                         tb->next->pprev = &tb->next;
143                 head->chain = tb;
144                 tb->pprev = &head->chain;
145         }
146         return tb;
147 }
148
149 /* Caller must disable local BH processing. */
150 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
151 {
152         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
153         struct tcp_bind_bucket *tb;
154
155         spin_lock(&head->lock);
156         tb = (struct tcp_bind_bucket *)sk->prev;
157         if ((child->bind_next = tb->owners) != NULL)
158                 tb->owners->bind_pprev = &child->bind_next;
159         tb->owners = child;
160         child->bind_pprev = &tb->owners;
161         child->prev = (struct sock *) tb;
162         spin_unlock(&head->lock);
163 }
164
165 __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
166 {
167         local_bh_disable();
168         __tcp_inherit_port(sk, child);
169         local_bh_enable();
170 }
171
172 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
173 {
174         sk->num = snum;
175         if ((sk->bind_next = tb->owners) != NULL)
176                 tb->owners->bind_pprev = &sk->bind_next;
177         tb->owners = sk;
178         sk->bind_pprev = &tb->owners;
179         sk->prev = (struct sock *) tb;
180 }
181
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183 {
184         struct sock *sk2 = tb->owners;
185         int sk_reuse = sk->reuse;
186         
187         for( ; sk2 != NULL; sk2 = sk2->bind_next) {
188                 if (sk != sk2 &&
189                     sk2->reuse <= 1 &&
190                     !ipv6_only_sock(sk2) &&
191                     sk->bound_dev_if == sk2->bound_dev_if) {
192                         if (!sk_reuse   ||
193                             !sk2->reuse ||
194                             sk2->state == TCP_LISTEN) {
195                                 if (!sk2->rcv_saddr     ||
196                                     !sk->rcv_saddr      ||
197                                     (sk2->rcv_saddr == sk->rcv_saddr))
198                                         break;
199                         }
200                 }
201         }
202         return sk2 != NULL;
203 }
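/*
 * Editor's sketch (user-space, not kernel code): the conflict walk above is
 * what decides whether bind(2) fails with EADDRINUSE.  The classic idiom of
 * setting SO_REUSEADDR before bind() works because it sets sk->reuse on the
 * new socket, which relaxes the test provided the existing owners of the port
 * also set it and none of them is in TCP_LISTEN state (or they are bound to
 * different local addresses).  The demo_* name is illustrative and error
 * handling is elided.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int demo_bind_reuseaddr(unsigned short port)
{
        struct sockaddr_in addr;
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(port);

        return bind(fd, (struct sockaddr *)&addr, sizeof(addr));
}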
204
205 /* Obtain a reference to a local port for the given sock,
206  * if snum is zero it means select any available local port.
207  */
208 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
209 {
210         struct tcp_bind_hashbucket *head;
211         struct tcp_bind_bucket *tb;
212         int ret;
213
214         local_bh_disable();
215         if (snum == 0) {
216                 int low = sysctl_local_port_range[0];
217                 int high = sysctl_local_port_range[1];
218                 int remaining = (high - low) + 1;
219                 int rover;
220
221                 spin_lock(&tcp_portalloc_lock);
222                 rover = tcp_port_rover;
223                 do {    rover++;
224                         if ((rover < low) || (rover > high))
225                                 rover = low;
226                         head = &tcp_bhash[tcp_bhashfn(rover)];
227                         spin_lock(&head->lock);
228                         for (tb = head->chain; tb; tb = tb->next)
229                                 if (tb->port == rover)
230                                         goto next;
231                         break;
232                 next:
233                         spin_unlock(&head->lock);
234                 } while (--remaining > 0);
235                 tcp_port_rover = rover;
236                 spin_unlock(&tcp_portalloc_lock);
237
238                 /* Exhausted local port range during search? */
239                 ret = 1;
240                 if (remaining <= 0)
241                         goto fail;
242
243                 /* OK, here is the one we will use.  HEAD is
244                  * non-NULL and we hold its mutex.
245                  */
246                 snum = rover;
247                 tb = NULL;
248         } else {
249                 head = &tcp_bhash[tcp_bhashfn(snum)];
250                 spin_lock(&head->lock);
251                 for (tb = head->chain; tb != NULL; tb = tb->next)
252                         if (tb->port == snum)
253                                 break;
254         }
255         if (tb != NULL && tb->owners != NULL) {
256                 if (sk->reuse > 1)
257                         goto success;
258                 if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
259                         goto success;
260                 } else {
261                         ret = 1;
262                         if (tcp_bind_conflict(sk, tb))
263                                 goto fail_unlock;
264                 }
265         }
266         ret = 1;
267         if (tb == NULL &&
268             (tb = tcp_bucket_create(head, snum)) == NULL)
269                         goto fail_unlock;
270         if (tb->owners == NULL) {
271                 if (sk->reuse && sk->state != TCP_LISTEN)
272                         tb->fastreuse = 1;
273                 else
274                         tb->fastreuse = 0;
275         } else if (tb->fastreuse &&
276                    ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
277                 tb->fastreuse = 0;
278 success:
279         if (sk->prev == NULL)
280                 tcp_bind_hash(sk, tb, snum);
281         BUG_TRAP(sk->prev == (struct sock *) tb);
282         ret = 0;
283
284 fail_unlock:
285         spin_unlock(&head->lock);
286 fail:
287         local_bh_enable();
288         return ret;
289 }
290
291 /* Get rid of any references to a local port held by the
292  * given sock.
293  */
294 __inline__ void __tcp_put_port(struct sock *sk)
295 {
296         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
297         struct tcp_bind_bucket *tb;
298
299         spin_lock(&head->lock);
300         tb = (struct tcp_bind_bucket *) sk->prev;
301         if (sk->bind_next)
302                 sk->bind_next->bind_pprev = sk->bind_pprev;
303         *(sk->bind_pprev) = sk->bind_next;
304         sk->prev = NULL;
305         sk->num = 0;
306         if (tb->owners == NULL) {
307                 if (tb->next)
308                         tb->next->pprev = tb->pprev;
309                 *(tb->pprev) = tb->next;
310                 kmem_cache_free(tcp_bucket_cachep, tb);
311         }
312         spin_unlock(&head->lock);
313 }
314
315 void tcp_put_port(struct sock *sk)
316 {
317         local_bh_disable();
318         __tcp_put_port(sk);
319         local_bh_enable();
320 }
321
322 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
323  * Look, when several writers sleep and a reader wakes them up, all but one
324  * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
325  * this, _but_ remember, it adds useless work on UP machines (a wake up on each
326  * exclusive lock release). It should really be ifdefed.
327  */
328
329 void tcp_listen_wlock(void)
330 {
331         write_lock(&tcp_lhash_lock);
332
333         if (atomic_read(&tcp_lhash_users)) {
334                 DECLARE_WAITQUEUE(wait, current);
335
336                 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
337                 for (;;) {
338                         set_current_state(TASK_UNINTERRUPTIBLE);
339                         if (atomic_read(&tcp_lhash_users) == 0)
340                                 break;
341                         write_unlock_bh(&tcp_lhash_lock);
342                         schedule();
343                         write_lock_bh(&tcp_lhash_lock);
344                 }
345
346                 __set_current_state(TASK_RUNNING);
347                 remove_wait_queue(&tcp_lhash_wait, &wait);
348         }
349 }
350
351 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
352 {
353         struct sock **skp;
354         rwlock_t *lock;
355
356         BUG_TRAP(sk->pprev==NULL);
357         if(listen_possible && sk->state == TCP_LISTEN) {
358                 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359                 lock = &tcp_lhash_lock;
360                 tcp_listen_wlock();
361         } else {
362                 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
363                 lock = &tcp_ehash[sk->hashent].lock;
364                 write_lock(lock);
365         }
366         if((sk->next = *skp) != NULL)
367                 (*skp)->pprev = &sk->next;
368         *skp = sk;
369         sk->pprev = skp;
370         sock_prot_inc_use(sk->prot);
371         write_unlock(lock);
372         if (listen_possible && sk->state == TCP_LISTEN)
373                 wake_up(&tcp_lhash_wait);
374 }
375
376 static void tcp_v4_hash(struct sock *sk)
377 {
378         if (sk->state != TCP_CLOSE) {
379                 local_bh_disable();
380                 __tcp_v4_hash(sk, 1);
381                 local_bh_enable();
382         }
383 }
384
385 void tcp_unhash(struct sock *sk)
386 {
387         rwlock_t *lock;
388
389         if (!sk->pprev)
390                 goto ende;
391
392         if (sk->state == TCP_LISTEN) {
393                 local_bh_disable();
394                 tcp_listen_wlock();
395                 lock = &tcp_lhash_lock;
396         } else {
397                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
398                 lock = &head->lock;
399                 write_lock_bh(&head->lock);
400         }
401
402         if(sk->pprev) {
403                 if(sk->next)
404                         sk->next->pprev = sk->pprev;
405                 *sk->pprev = sk->next;
406                 sk->pprev = NULL;
407                 sock_prot_dec_use(sk->prot);
408         }
409         write_unlock_bh(lock);
410
411  ende:
412         if (sk->state == TCP_LISTEN)
413                 wake_up(&tcp_lhash_wait);
414 }
415
416 /* Don't inline this cruft.  There are some nice properties to
417  * exploit here.  The BSD API does not allow a listening TCP
418  * to specify the remote port nor the remote address for the
419  * connection.  So always assume those are both wildcarded
420  * during the search since they can never be otherwise.
421  */
422 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
423 {
424         struct sock *result = NULL;
425         int score, hiscore;
426
427         hiscore=-1;
428         for(; sk; sk = sk->next) {
429                 if(sk->num == hnum && !ipv6_only_sock(sk)) {
430                         __u32 rcv_saddr = sk->rcv_saddr;
431
432 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
433                         score = sk->family == PF_INET ? 1 : 0;
434 #else
435                         score = 1;
436 #endif
437                         if(rcv_saddr) {
438                                 if (rcv_saddr != daddr)
439                                         continue;
440                                 score+=2;
441                         }
442                         if (sk->bound_dev_if) {
443                                 if (sk->bound_dev_if != dif)
444                                         continue;
445                                 score+=2;
446                         }
447                         if (score == 5)
448                                 return sk;
449                         if (score > hiscore) {
450                                 hiscore = score;
451                                 result = sk;
452                         }
453                 }
454         }
455         return result;
456 }
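/*
 * Editor's note: the scoring above gives 1 point for an AF_INET socket (a
 * wildcard IPv6 listener scores 0 on dual-stack kernels), 2 more points for an
 * exact match on the bound local address and 2 more for a match on the bound
 * device, so a perfect score of 5 lets the loop return immediately; otherwise
 * the best-scoring listener seen so far wins.
 */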
457
458 /* Optimize the common listener case. */
459 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
460 {
461         struct sock *sk;
462
463         read_lock(&tcp_lhash_lock);
464         sk = tcp_listening_hash[tcp_lhashfn(hnum)];
465         if (sk) {
466                 if (sk->num == hnum &&
467                     sk->next == NULL &&
468                     (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
469                     (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
470                     !sk->bound_dev_if)
471                         goto sherry_cache;
472                 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
473         }
474         if (sk) {
475 sherry_cache:
476                 sock_hold(sk);
477         }
478         read_unlock(&tcp_lhash_lock);
479         return sk;
480 }
481
482 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
483  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
484  *
485  * Local BH must be disabled here.
486  */
487
488 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
489                                                        u32 daddr, u16 hnum, int dif)
490 {
491         struct tcp_ehash_bucket *head;
492         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
493         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
494         struct sock *sk;
495         int hash;
496
497         /* Optimize here for direct hit, only listening connections can
498          * have wildcards anyway.
499          */
500         hash = tcp_hashfn(daddr, hnum, saddr, sport);
501         head = &tcp_ehash[hash];
502         read_lock(&head->lock);
503         for(sk = head->chain; sk; sk = sk->next) {
504                 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
505                         goto hit; /* You sunk my battleship! */
506         }
507
508         /* Must check for a TIME_WAIT'er before going to listener hash. */
509         for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
510                 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
511                         goto hit;
512         read_unlock(&head->lock);
513
514         return NULL;
515
516 hit:
517         sock_hold(sk);
518         read_unlock(&head->lock);
519         return sk;
520 }
521
522 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
523                                            u32 daddr, u16 hnum, int dif)
524 {
525         struct sock *sk;
526
527         sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
528
529         if (sk)
530                 return sk;
531                 
532         return tcp_v4_lookup_listener(daddr, hnum, dif);
533 }
534
535 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
536 {
537         struct sock *sk;
538
539         local_bh_disable();
540         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
541         local_bh_enable();
542
543         return sk;
544 }
545
546 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
547 {
548         return secure_tcp_sequence_number(skb->nh.iph->daddr,
549                                           skb->nh.iph->saddr,
550                                           skb->h.th->dest,
551                                           skb->h.th->source);
552 }
553
554 /* called with local bh disabled */
555 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
556                                       struct tcp_tw_bucket **twp)
557 {
558         u32 daddr = sk->rcv_saddr;
559         u32 saddr = sk->daddr;
560         int dif = sk->bound_dev_if;
561         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
562         __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
563         int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
564         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
565         struct sock *sk2, **skp;
566         struct tcp_tw_bucket *tw;
567
568         write_lock(&head->lock);
569
570         /* Check TIME-WAIT sockets first. */
571         for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
572             skp = &sk2->next) {
573                 tw = (struct tcp_tw_bucket*)sk2;
574
575                 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
576                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
577
578                         /* With PAWS, it is safe from the viewpoint
579                            of data integrity. Even without PAWS it
580                            is safe provided sequence spaces do not
581                            overlap i.e. at data rates <= 80Mbit/sec.
582
583                            Actually, the idea is close to VJ's, only the
584                            timestamp cache is held not per host but per
585                            port pair, and the TW bucket is used as the
586                            state holder.
587
588                            If the TW bucket has already been destroyed, we
589                            fall back to VJ's scheme and use the initial
590                            timestamp retrieved from the peer table.
591                          */
592                         if (tw->ts_recent_stamp &&
593                             (!twp || (sysctl_tcp_tw_reuse &&
594                                       xtime.tv_sec - tw->ts_recent_stamp > 1))) {
595                                 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
596                                         tp->write_seq = 1;
597                                 tp->ts_recent = tw->ts_recent;
598                                 tp->ts_recent_stamp = tw->ts_recent_stamp;
599                                 sock_hold(sk2);
600                                 skp = &head->chain;
601                                 goto unique;
602                         } else
603                                 goto not_unique;
604                 }
605         }
606         tw = NULL;
607
608         /* And established part... */
609         for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
610                 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
611                         goto not_unique;
612         }
613
614 unique:
615         /* Must record num and sport now. Otherwise we will see a
616          * socket with a funny identity in the hash table. */
617         sk->num = lport;
618         sk->sport = htons(lport);
619         BUG_TRAP(sk->pprev==NULL);
620         if ((sk->next = *skp) != NULL)
621                 (*skp)->pprev = &sk->next;
622
623         *skp = sk;
624         sk->pprev = skp;
625         sk->hashent = hash;
626         sock_prot_inc_use(sk->prot);
627         write_unlock(&head->lock);
628
629         if (twp) {
630                 *twp = tw;
631                 NET_INC_STATS_BH(TimeWaitRecycled);
632         } else if (tw) {
633                 /* Silly. Should hash-dance instead... */
634                 tcp_tw_deschedule(tw);
635                 tcp_timewait_kill(tw);
636                 NET_INC_STATS_BH(TimeWaitRecycled);
637
638                 tcp_tw_put(tw);
639         }
640
641         return 0;
642
643 not_unique:
644         write_unlock(&head->lock);
645         return -EADDRNOTAVAIL;
646 }
647
648 /*
649  * Bind a port for a connect operation and hash it.
650  */
651 static int tcp_v4_hash_connect(struct sock *sk)
652 {
653         unsigned short snum = sk->num;
654         struct tcp_bind_hashbucket *head;
655         struct tcp_bind_bucket *tb;
656
657         if (snum == 0) {
658                 int rover;
659                 int low = sysctl_local_port_range[0];
660                 int high = sysctl_local_port_range[1];
661                 int remaining = (high - low) + 1;
662                 struct tcp_tw_bucket *tw = NULL;
663
664                 local_bh_disable();
665
666                 /* TODO. Actually it is not such a bad idea to remove
667                  * tcp_portalloc_lock before the next submission to Linus.
668                  * As soon as we touch this place at all it is time to think.
669                  *
670                  * Right now it protects a single _advisory_ variable, tcp_port_rover,
671                  * hence it is mostly useless.
672                  * The code will work nicely if we just delete it, but
673                  * I am afraid that in the contended case it will work no better
674                  * or even worse: another cpu will just hit the same bucket
675                  * and spin there.
676                  * So some cpu salt could remove both contention and
677                  * memory pingpong. Any ideas how to do this in a nice way?
678                  */
679                 spin_lock(&tcp_portalloc_lock);
680                 rover = tcp_port_rover;
681
682                 do {
683                         rover++;
684                         if ((rover < low) || (rover > high))
685                                 rover = low;
686                         head = &tcp_bhash[tcp_bhashfn(rover)];
687                         spin_lock(&head->lock);         
688
689                         /* Does not bother with rcv_saddr checks,
690                          * because the established check is already
691                          * unique enough.
692                          */
693                         for (tb = head->chain; tb; tb = tb->next) {
694                                 if (tb->port == rover) {
695                                         BUG_TRAP(tb->owners != NULL);
696                                         if (tb->fastreuse >= 0)
697                                                 goto next_port;
698                                         if (!__tcp_v4_check_established(sk, rover, &tw))
699                                                 goto ok;
700                                         goto next_port;
701                                 }
702                         }
703
704                         tb = tcp_bucket_create(head, rover);
705                         if (!tb) {
706                                 spin_unlock(&head->lock);
707                                 break;
708                         }
709                         tb->fastreuse = -1;
710                         goto ok;
711
712                 next_port:
713                         spin_unlock(&head->lock);
714                 } while (--remaining > 0);
715                 tcp_port_rover = rover;
716                 spin_unlock(&tcp_portalloc_lock);
717
718                 local_bh_enable();
719
720                 return -EADDRNOTAVAIL;
721
722         ok:
723                 /* All locks still held and bhs disabled */
724                 tcp_port_rover = rover;
725                 spin_unlock(&tcp_portalloc_lock);
726
727                 tcp_bind_hash(sk, tb, rover);
728                 if (!sk->pprev) {
729                         sk->sport = htons(rover);
730                         __tcp_v4_hash(sk, 0);
731                 }
732                 spin_unlock(&head->lock);
733
734                 if (tw) {
735                         tcp_tw_deschedule(tw);
736                         tcp_timewait_kill(tw);
737                         tcp_tw_put(tw);
738                 }
739
740                 local_bh_enable();
741                 return 0;
742         }
743
744         head  = &tcp_bhash[tcp_bhashfn(snum)];
745         tb  = (struct tcp_bind_bucket *)sk->prev;
746         spin_lock_bh(&head->lock);
747         if (tb->owners == sk && sk->bind_next == NULL) {
748                 __tcp_v4_hash(sk, 0);
749                 spin_unlock_bh(&head->lock);
750                 return 0;
751         } else {
752                 int ret;
753                 spin_unlock(&head->lock);
754                 /* No definite answer... Walk to established hash table */
755                 ret = __tcp_v4_check_established(sk, snum, NULL);
756                 local_bh_enable();
757                 return ret;
758         }
759 }
760
761 /* This will initiate an outgoing connection. */
762 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
763 {
764         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
765         struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
766         struct rtable *rt;
767         u32 daddr, nexthop;
768         int tmp;
769         int err;
770
771         if (addr_len < sizeof(struct sockaddr_in))
772                 return(-EINVAL);
773
774         if (usin->sin_family != AF_INET)
775                 return(-EAFNOSUPPORT);
776
777         nexthop = daddr = usin->sin_addr.s_addr;
778         if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
779                 if (daddr == 0)
780                         return -EINVAL;
781                 nexthop = sk->protinfo.af_inet.opt->faddr;
782         }
783
784         tmp = ip_route_connect(&rt, nexthop, sk->saddr,
785                                RT_CONN_FLAGS(sk), sk->bound_dev_if);
786         if (tmp < 0)
787                 return tmp;
788
789         if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
790                 ip_rt_put(rt);
791                 return -ENETUNREACH;
792         }
793
794         __sk_dst_set(sk, &rt->u.dst);
795         sk->route_caps = rt->u.dst.dev->features;
796
797         if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
798                 daddr = rt->rt_dst;
799
800         if (!sk->saddr)
801                 sk->saddr = rt->rt_src;
802         sk->rcv_saddr = sk->saddr;
803
804         if (tp->ts_recent_stamp && sk->daddr != daddr) {
805                 /* Reset inherited state */
806                 tp->ts_recent = 0;
807                 tp->ts_recent_stamp = 0;
808                 tp->write_seq = 0;
809         }
810
811         if (sysctl_tcp_tw_recycle &&
812             !tp->ts_recent_stamp &&
813             rt->rt_dst == daddr) {
814                 struct inet_peer *peer = rt_get_peer(rt);
815
816                 /* VJ's idea. We save the last timestamp seen from
817                  * the destination in the peer table when entering TIME-WAIT state,
818                  * and initialize ts_recent from it when trying a new connection.
819                  */
820
821                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
822                         tp->ts_recent_stamp = peer->tcp_ts_stamp;
823                         tp->ts_recent = peer->tcp_ts;
824                 }
825         }
826
827         sk->dport = usin->sin_port;
828         sk->daddr = daddr;
829
830         tp->ext_header_len = 0;
831         if (sk->protinfo.af_inet.opt)
832                 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
833
834         tp->mss_clamp = 536;
835
836         /* Socket identity is still unknown (sport may be zero).
837          * However we set the state to SYN-SENT and, without releasing the
838          * socket lock, select a source port, enter ourselves into the hash
839          * tables and complete initialization after this.
840          */
841         tcp_set_state(sk, TCP_SYN_SENT);
842         err = tcp_v4_hash_connect(sk);
843         if (err)
844                 goto failure;
845
846         if (!tp->write_seq)
847                 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
848                                                            sk->sport, usin->sin_port);
849
850         sk->protinfo.af_inet.id = tp->write_seq^jiffies;
851
852         err = tcp_connect(sk);
853         if (err)
854                 goto failure;
855
856         return 0;
857
858 failure:
859         tcp_set_state(sk, TCP_CLOSE);
860         __sk_dst_reset(sk);
861         sk->route_caps = 0;
862         sk->dport = 0;
863         return err;
864 }
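/*
 * Editor's sketch (user-space, not kernel code): tcp_v4_connect() is reached
 * from an ordinary connect(2) on an AF_INET stream socket, which is what
 * drives the route lookup, source-port selection and SYN transmission above.
 * The demo_* name is purely illustrative; a minimal blocking client, error
 * handling mostly elided:
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int demo_tcp_connect(const char *ip, unsigned short port)
{
        struct sockaddr_in peer;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        memset(&peer, 0, sizeof(peer));
        peer.sin_family = AF_INET;
        peer.sin_port = htons(port);
        inet_pton(AF_INET, ip, &peer.sin_addr);

        if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}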
865
866 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
867 {
868         return ((struct rtable*)skb->dst)->rt_iif;
869 }
870
871 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
872 {
873         unsigned h = raddr ^ rport;
874         h ^= h>>16;
875         h ^= h>>8;
876         return h&(TCP_SYNQ_HSIZE-1);
877 }
878
879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, 
880                                               struct open_request ***prevp,
881                                               __u16 rport,
882                                               __u32 raddr, __u32 laddr)
883 {
884         struct tcp_listen_opt *lopt = tp->listen_opt;
885         struct open_request *req, **prev;  
886
887         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
888              (req = *prev) != NULL;
889              prev = &req->dl_next) {
890                 if (req->rmt_port == rport &&
891                     req->af.v4_req.rmt_addr == raddr &&
892                     req->af.v4_req.loc_addr == laddr &&
893                     TCP_INET_FAMILY(req->class->family)) {
894                         BUG_TRAP(req->sk == NULL);
895                         *prevp = prev;
896                         return req; 
897                 }
898         }
899
900         return NULL;
901 }
902
903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
904 {
905         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
906         struct tcp_listen_opt *lopt = tp->listen_opt;
907         unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
908
909         req->expires = jiffies + TCP_TIMEOUT_INIT;
910         req->retrans = 0;
911         req->sk = NULL;
912         req->dl_next = lopt->syn_table[h];
913
914         write_lock(&tp->syn_wait_lock);
915         lopt->syn_table[h] = req;
916         write_unlock(&tp->syn_wait_lock);
917
918         tcp_synq_added(sk);
919 }
920
921
922 /* 
923  * This routine does path mtu discovery as defined in RFC1191.
924  */
925 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
926 {
927         struct dst_entry *dst;
928         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
929
930         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
931          * sent out by Linux are always < 576 bytes so they should go through
932          * unfragmented).
933          */
934         if (sk->state == TCP_LISTEN)
935                 return; 
936
937         /* We don't check in the dst entry whether pmtu discovery is forbidden
938          * on this route. We just assume that no packet-too-big packets
939          * are sent back when pmtu discovery is not active.
940          * There is a small race when the user changes this flag in the
941          * route, but I think that's acceptable.
942          */
943         if ((dst = __sk_dst_check(sk, 0)) == NULL)
944                 return;
945
946         ip_rt_update_pmtu(dst, mtu);
947
948         /* Something is about to go wrong... Remember the soft error
949          * in case this connection is not able to recover.
950          */
951         if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
952                 sk->err_soft = EMSGSIZE;
953
954         if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
955             tp->pmtu_cookie > dst->pmtu) {
956                 tcp_sync_mss(sk, dst->pmtu);
957
958                 /* Resend the TCP packet because it's  
959                  * clear that the old packet has been
960                  * dropped. This is the new "fast" path mtu
961                  * discovery.
962                  */
963                 tcp_simple_retransmit(sk);
964         } /* else let the usual retransmit timer handle it */
965 }
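/*
 * Editor's note (illustrative, not part of the original file): whether this
 * path is allowed to shrink the MSS is controlled per socket by
 * sk->protinfo.af_inet.pmtudisc, which user space sets through the
 * IP_MTU_DISCOVER socket option, e.g.:
 *
 *     int val = IP_PMTUDISC_DO;    set DF and rely on path MTU discovery
 *     setsockopt(fd, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * With IP_PMTUDISC_DONT, the check against IP_PMTUDISC_DONT above prevents
 * tcp_sync_mss() from being called.
 */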
966
967 /*
968  * This routine is called by the ICMP module when it gets some
969  * sort of error condition.  If err < 0 then the socket should
970  * be closed and the error returned to the user.  If err > 0
971  * it's just the icmp type << 8 | icmp code.  After adjustment
972  * header points to the first 8 bytes of the tcp header.  We need
973  * to find the appropriate port.
974  *
975  * The locking strategy used here is very "optimistic". When
976  * someone else accesses the socket the ICMP is just dropped
977  * and for some paths there is no check at all.
978  * A more general error queue to queue errors for later handling
979  * is probably better.
980  *
981  */
982
983 void tcp_v4_err(struct sk_buff *skb, u32 info)
984 {
985         struct iphdr *iph = (struct iphdr*)skb->data;
986         struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
987         struct tcp_opt *tp;
988         int type = skb->h.icmph->type;
989         int code = skb->h.icmph->code;
990         struct sock *sk;
991         __u32 seq;
992         int err;
993
994         if (skb->len < (iph->ihl << 2) + 8) {
995                 ICMP_INC_STATS_BH(IcmpInErrors); 
996                 return;
997         }
998
999         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
1000         if (sk == NULL) {
1001                 ICMP_INC_STATS_BH(IcmpInErrors);
1002                 return;
1003         }
1004         if (sk->state == TCP_TIME_WAIT) {
1005                 tcp_tw_put((struct tcp_tw_bucket*)sk);
1006                 return;
1007         }
1008
1009         bh_lock_sock(sk);
1010         /* If too many ICMPs get dropped on busy
1011          * servers this needs to be solved differently.
1012          */
1013         if (sk->lock.users != 0)
1014                 NET_INC_STATS_BH(LockDroppedIcmps);
1015
1016         if (sk->state == TCP_CLOSE)
1017                 goto out;
1018
1019         tp = &sk->tp_pinfo.af_tcp;
1020         seq = ntohl(th->seq);
1021         if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1022                 NET_INC_STATS(OutOfWindowIcmps);
1023                 goto out;
1024         }
1025
1026         switch (type) {
1027         case ICMP_SOURCE_QUENCH:
1028                 /* This is deprecated, but if someone generated it,
1029                  * we have no reason to ignore it.
1030                  */
1031                 if (sk->lock.users == 0)
1032                         tcp_enter_cwr(tp);
1033                 goto out;
1034         case ICMP_PARAMETERPROB:
1035                 err = EPROTO;
1036                 break; 
1037         case ICMP_DEST_UNREACH:
1038                 if (code > NR_ICMP_UNREACH)
1039                         goto out;
1040
1041                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1042                         if (sk->lock.users == 0)
1043                                 do_pmtu_discovery(sk, iph, info);
1044                         goto out;
1045                 }
1046
1047                 err = icmp_err_convert[code].errno;
1048                 break;
1049         case ICMP_TIME_EXCEEDED:
1050                 err = EHOSTUNREACH;
1051                 break;
1052         default:
1053                 goto out;
1054         }
1055
1056         switch (sk->state) {
1057                 struct open_request *req, **prev;
1058         case TCP_LISTEN:
1059                 if (sk->lock.users != 0)
1060                         goto out;
1061
1062                 req = tcp_v4_search_req(tp, &prev,
1063                                         th->dest,
1064                                         iph->daddr, iph->saddr); 
1065                 if (!req)
1066                         goto out;
1067
1068                 /* ICMPs are not backlogged, hence we cannot get
1069                    an established socket here.
1070                  */
1071                 BUG_TRAP(req->sk == NULL);
1072
1073                 if (seq != req->snt_isn) {
1074                         NET_INC_STATS_BH(OutOfWindowIcmps);
1075                         goto out;
1076                 }
1077
1078                 /* 
1079                  * Still in SYN_RECV, just remove it silently.
1080                  * There is no good way to pass the error to the newly
1081                  * created socket, and POSIX does not want network
1082                  * errors returned from accept(). 
1083                  */ 
1084                 tcp_synq_drop(sk, req, prev);
1085                 goto out;
1086
1087         case TCP_SYN_SENT:
1088         case TCP_SYN_RECV:  /* Cannot happen.
1089                                It can, for example, if SYNs crossed.
1090                              */ 
1091                 if (sk->lock.users == 0) {
1092                         TCP_INC_STATS_BH(TcpAttemptFails);
1093                         sk->err = err;
1094
1095                         sk->error_report(sk);
1096
1097                         tcp_done(sk);
1098                 } else {
1099                         sk->err_soft = err;
1100                 }
1101                 goto out;
1102         }
1103
1104         /* If we've already connected we will keep trying
1105          * until we time out, or the user gives up.
1106          *
1107          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
1108          * considered hard errors (well, FRAG_FAILED too,
1109          * but it is obsoleted by pmtu discovery).
1110          *
1111          * Note that in the modern internet, where routing is unreliable
1112          * and broken firewalls sit in every dark corner sending random
1113          * errors ordered by their masters, even these two messages finally lose
1114          * their original sense (even Linux sends invalid PORT_UNREACHs)
1115          *
1116          * Now we are in compliance with RFCs.
1117          *                                                      --ANK (980905)
1118          */
1119
1120         if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1121                 sk->err = err;
1122                 sk->error_report(sk);
1123         } else  { /* Only an error on timeout */
1124                 sk->err_soft = err;
1125         }
1126
1127 out:
1128         bh_unlock_sock(sk);
1129         sock_put(sk);
1130 }
1131
1132 /* This routine computes an IPv4 TCP checksum. */
1133 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 
1134                        struct sk_buff *skb)
1135 {
1136         if (skb->ip_summed == CHECKSUM_HW) {
1137                 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1138                 skb->csum = offsetof(struct tcphdr, check);
1139         } else {
1140                 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1141                                          csum_partial((char *)th, th->doff<<2, skb->csum));
1142         }
1143 }
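/*
 * Editor's note: in the CHECKSUM_HW case only the pseudo-header checksum is
 * computed here (the ~tcp_v4_check(..., 0) above) and skb->csum records the
 * offset of the check field, so the device can complete the checksum over the
 * TCP header and payload; otherwise the full checksum is computed in software
 * via csum_partial().
 */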
1144
1145 /*
1146  *      This routine will send an RST to the other tcp.
1147  *
1148  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1149  *                    for the reset.
1150  *      Answer: if a packet caused a RST, it is not for a socket
1151  *              existing in our system; if it is matched to a socket,
1152  *              it is just a duplicate segment or a bug in the other side's TCP.
1153  *              So we build the reply based only on the parameters
1154  *              that arrived with the segment.
1155  *      Exception: precedence violation. We do not implement it in any case.
1156  */
1157
1158 static void tcp_v4_send_reset(struct sk_buff *skb)
1159 {
1160         struct tcphdr *th = skb->h.th;
1161         struct tcphdr rth;
1162         struct ip_reply_arg arg;
1163
1164         /* Never send a reset in response to a reset. */
1165         if (th->rst)
1166                 return;
1167
1168         if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1169                 return;
1170
1171         /* Swap the send and the receive. */
1172         memset(&rth, 0, sizeof(struct tcphdr)); 
1173         rth.dest = th->source;
1174         rth.source = th->dest; 
1175         rth.doff = sizeof(struct tcphdr)/4;
1176         rth.rst = 1;
1177
1178         if (th->ack) {
1179                 rth.seq = th->ack_seq;
1180         } else {
1181                 rth.ack = 1;
1182                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1183                                     + skb->len - (th->doff<<2));
1184         }
1185
1186         memset(&arg, 0, sizeof arg); 
1187         arg.iov[0].iov_base = (unsigned char *)&rth; 
1188         arg.iov[0].iov_len  = sizeof rth;
1189         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 
1190                                       skb->nh.iph->saddr, /*XXX*/
1191                                       sizeof(struct tcphdr),
1192                                       IPPROTO_TCP,
1193                                       0); 
1194         arg.n_iov = 1;
1195         arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
1196
1197         tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
1198         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1199
1200         TCP_INC_STATS_BH(TcpOutSegs);
1201         TCP_INC_STATS_BH(TcpOutRsts);
1202 }
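/*
 * Editor's note: the sequence numbers above follow RFC 793 reset generation.
 * If the offending segment carried an ACK, the RST is sent with seq equal to
 * that ACK value and no ACK flag; otherwise the RST acknowledges exactly the
 * span covered by the segment (its data length plus one for each of SYN and
 * FIN), with seq left at zero.
 */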
1203
1204 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1205    outside of socket context, is certainly ugly. What can I do?
1206  */
1207
1208 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1209 {
1210         struct tcphdr *th = skb->h.th;
1211         struct {
1212                 struct tcphdr th;
1213                 u32 tsopt[3];
1214         } rep;
1215         struct ip_reply_arg arg;
1216
1217         memset(&rep.th, 0, sizeof(struct tcphdr));
1218         memset(&arg, 0, sizeof arg);
1219
1220         arg.iov[0].iov_base = (unsigned char *)&rep; 
1221         arg.iov[0].iov_len  = sizeof(rep.th);
1222         arg.n_iov = 1;
1223         if (ts) {
1224                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
1225                                      (TCPOPT_NOP << 16) |
1226                                      (TCPOPT_TIMESTAMP << 8) |
1227                                      TCPOLEN_TIMESTAMP);
1228                 rep.tsopt[1] = htonl(tcp_time_stamp);
1229                 rep.tsopt[2] = htonl(ts);
1230                 arg.iov[0].iov_len = sizeof(rep);
1231         }
1232
1233         /* Swap the send and the receive. */
1234         rep.th.dest = th->source;
1235         rep.th.source = th->dest; 
1236         rep.th.doff = arg.iov[0].iov_len/4;
1237         rep.th.seq = htonl(seq);
1238         rep.th.ack_seq = htonl(ack);
1239         rep.th.ack = 1;
1240         rep.th.window = htons(win);
1241
1242         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 
1243                                       skb->nh.iph->saddr, /*XXX*/
1244                                       arg.iov[0].iov_len,
1245                                       IPPROTO_TCP,
1246                                       0);
1247         arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
1248
1249         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1250
1251         TCP_INC_STATS_BH(TcpOutSegs);
1252 }
1253
1254 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1255 {
1256         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1257
1258         tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1259                         tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1260
1261         tcp_tw_put(tw);
1262 }
1263
1264 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1265 {
1266         tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1267                         req->ts_recent);
1268 }
1269
1270 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1271 {
1272         struct rtable *rt;
1273         struct ip_options *opt;
1274
1275         opt = req->af.v4_req.opt;
1276         if(ip_route_output(&rt, ((opt && opt->srr) ?
1277                                  opt->faddr :
1278                                  req->af.v4_req.rmt_addr),
1279                            req->af.v4_req.loc_addr,
1280                            RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
1281                 IP_INC_STATS_BH(IpOutNoRoutes);
1282                 return NULL;
1283         }
1284         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1285                 ip_rt_put(rt);
1286                 IP_INC_STATS_BH(IpOutNoRoutes);
1287                 return NULL;
1288         }
1289         return &rt->u.dst;
1290 }
1291
1292 /*
1293  *      Send a SYN-ACK after having received an ACK. 
1294  *      This still operates on an open_request only, not on a big
1295  *      socket.
1296  */ 
1297 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1298                               struct dst_entry *dst)
1299 {
1300         int err = -1;
1301         struct sk_buff * skb;
1302
1303         /* First, grab a route. */
1304         if (dst == NULL &&
1305             (dst = tcp_v4_route_req(sk, req)) == NULL)
1306                 goto out;
1307
1308         skb = tcp_make_synack(sk, dst, req);
1309
1310         if (skb) {
1311                 struct tcphdr *th = skb->h.th;
1312
1313                 th->check = tcp_v4_check(th, skb->len,
1314                                          req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1315                                          csum_partial((char *)th, skb->len, skb->csum));
1316
1317                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1318                                             req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1319                 if (err == NET_XMIT_CN)
1320                         err = 0;
1321         }
1322
1323 out:
1324         dst_release(dst);
1325         return err;
1326 }
1327
1328 /*
1329  *      IPv4 open_request destructor.
1330  */ 
1331 static void tcp_v4_or_free(struct open_request *req)
1332 {
1333         if (req->af.v4_req.opt)
1334                 kfree(req->af.v4_req.opt);
1335 }
1336
1337 static inline void syn_flood_warning(struct sk_buff *skb)
1338 {
1339         static unsigned long warntime;
1340         
1341         if (jiffies - warntime > HZ*60) {
1342                 warntime = jiffies;
1343                 printk(KERN_INFO 
1344                        "possible SYN flooding on port %d. Sending cookies.\n",  
1345                        ntohs(skb->h.th->dest));
1346         }
1347 }
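/*
 * Editor's note (illustrative, not part of the original file): the cookie
 * fallback only exists when the kernel was built with CONFIG_SYN_COOKIES, and
 * even then it must be switched on at run time, e.g. (assuming the standard
 * proc path):
 *
 *     echo 1 > /proc/sys/net/ipv4/tcp_syncookies
 */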
1348
1349 /* 
1350  * Save and compile IPv4 options into the open_request if needed. 
1351  */
1352 static inline struct ip_options * 
1353 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1354 {
1355         struct ip_options *opt = &(IPCB(skb)->opt);
1356         struct ip_options *dopt = NULL; 
1357
1358         if (opt && opt->optlen) {
1359                 int opt_size = optlength(opt); 
1360                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1361                 if (dopt) {
1362                         if (ip_options_echo(dopt, skb)) {
1363                                 kfree(dopt);
1364                                 dopt = NULL;
1365                         }
1366                 }
1367         }
1368         return dopt;
1369 }
1370
1371 /* 
1372  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1373  * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1374  * It would be better to replace it with a global counter for all sockets,
1375  * but then some measure against one socket starving all other sockets
1376  * would be needed.
1377  *
1378  * It was 128 by default. Experiments with real servers show that
1379  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1380  * of the problems. This value is adjusted to 128 for very small machines
1381  * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
1382  * Increasing it further requires changing the hash table size.
1383  */
1384 int sysctl_max_syn_backlog = 256; 
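/*
 * Editor's note (illustrative, not part of the original file): this limit is
 * exported as the tcp_max_syn_backlog sysctl and is applied per listening
 * socket, in addition to the accept-queue limit passed to listen(2), e.g.:
 *
 *     echo 1024 > /proc/sys/net/ipv4/tcp_max_syn_backlog
 */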
1385
1386 struct or_calltable or_ipv4 = {
1387         PF_INET,
1388         tcp_v4_send_synack,
1389         tcp_v4_or_send_ack,
1390         tcp_v4_or_free,
1391         tcp_v4_send_reset
1392 };
1393
1394 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1395 {
1396         struct tcp_opt tp;
1397         struct open_request *req;
1398         __u32 saddr = skb->nh.iph->saddr;
1399         __u32 daddr = skb->nh.iph->daddr;
1400         __u32 isn = TCP_SKB_CB(skb)->when;
1401         struct dst_entry *dst = NULL;
1402 #ifdef CONFIG_SYN_COOKIES
1403         int want_cookie = 0;
1404 #else
1405 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1406 #endif
1407
1408         /* Never answer SYNs sent to broadcast or multicast */
1409         if (((struct rtable *)skb->dst)->rt_flags & 
1410             (RTCF_BROADCAST|RTCF_MULTICAST))
1411                 goto drop; 
1412
1413         /* TW buckets are converted to open requests without
1414          * limitation; they conserve resources and the peer is
1415          * evidently a real one.
1416          */
1417         if (tcp_synq_is_full(sk) && !isn) {
1418 #ifdef CONFIG_SYN_COOKIES
1419                 if (sysctl_tcp_syncookies) {
1420                         want_cookie = 1; 
1421                 } else
1422 #endif
1423                 goto drop;
1424         }
1425
1426         /* The accept backlog is full. If we have already queued enough
1427          * warm entries in the syn queue, drop the request. That is better than
1428          * clogging the syn queue with openreqs with exponentially increasing
1429          * timeouts.
1430          */
1431         if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1432                 goto drop;
1433
1434         req = tcp_openreq_alloc();
1435         if (req == NULL)
1436                 goto drop;
1437
1438         tcp_clear_options(&tp);
1439         tp.mss_clamp = 536;
1440         tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1441
1442         tcp_parse_options(skb, &tp, 0);
1443
1444         if (want_cookie) {
1445                 tcp_clear_options(&tp);
1446                 tp.saw_tstamp = 0;
1447         }
1448
1449         if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1450                 /* Some OSes (unknown ones, but I see them on a web server, which
1451                  * contains information interesting only for windows
1452                  * users) do not send their timestamp in the SYN. It is an easy case.
1453                  * We simply do not advertise TS support.
1454                  */
1455                 tp.saw_tstamp = 0;
1456                 tp.tstamp_ok = 0;
1457         }
1458         tp.tstamp_ok = tp.saw_tstamp;
1459
1460         tcp_openreq_init(req, &tp, skb);
1461
1462         req->af.v4_req.loc_addr = daddr;
1463         req->af.v4_req.rmt_addr = saddr;
1464         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1465         req->class = &or_ipv4;
1466         if (!want_cookie)
1467                 TCP_ECN_create_request(req, skb->h.th);
1468
1469         if (want_cookie) {
1470 #ifdef CONFIG_SYN_COOKIES
1471                 syn_flood_warning(skb);
1472 #endif
1473                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1474         } else if (isn == 0) {
1475                 struct inet_peer *peer = NULL;
1476
1477                 /* VJ's idea. We save the last timestamp seen from
1478                  * the destination in the peer table when entering
1479                  * TIME-WAIT state, and check against it before
1480                  * accepting a new connection request.
1481                  *
1482                  * If "isn" is not zero, this request hit a live
1483                  * timewait bucket, so all the necessary checks
1484                  * were made by the code processing the timewait state.
1485                  */
1486                 if (tp.saw_tstamp &&
1487                     sysctl_tcp_tw_recycle &&
1488                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1489                     (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1490                     peer->v4daddr == saddr) {
1491                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1492                             (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1493                                 NET_INC_STATS_BH(PAWSPassiveRejected);
1494                                 dst_release(dst);
1495                                 goto drop_and_free;
1496                         }
1497                 }
1498                 /* Kill the following clause if you dislike this heuristic. */
1499                 else if (!sysctl_tcp_syncookies &&
1500                          (sysctl_max_syn_backlog - tcp_synq_len(sk)
1501                           < (sysctl_max_syn_backlog>>2)) &&
1502                          (!peer || !peer->tcp_ts_stamp) &&
1503                          (!dst || !dst->rtt)) {
1504                         /* Without syncookies, the last quarter of the
1505                          * backlog is reserved for destinations proven to be alive.
1506                          * This means that during a synflood we keep communicating
1507                          * with destinations that were already remembered
1508                          * before the flood started.
1509                          */
1510                         NETDEBUG(if (net_ratelimit()) \
1511                                 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1512                                         NIPQUAD(saddr), ntohs(skb->h.th->source)));
1513                         dst_release(dst);
1514                         goto drop_and_free;
1515                 }
1516
1517                 isn = tcp_v4_init_sequence(sk, skb);
1518         }
1519         req->snt_isn = isn;
1520
1521         if (tcp_v4_send_synack(sk, req, dst))
1522                 goto drop_and_free;
1523
1524         if (want_cookie) {
1525                 tcp_openreq_free(req); 
1526         } else {
1527                 tcp_v4_synq_add(sk, req);
1528         }
1529         return 0;
1530
1531 drop_and_free:
1532         tcp_openreq_free(req); 
1533 drop:
1534         TCP_INC_STATS_BH(TcpAttemptFails);
1535         return 0;
1536 }
1537
1538
1539 /*
1540  * The three-way handshake has completed - we received the final ACK -
1541  * so now create the new socket.
1542  */
1543 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1544                                    struct open_request *req,
1545                                    struct dst_entry *dst)
1546 {
1547         struct tcp_opt *newtp;
1548         struct sock *newsk;
1549
1550         if (tcp_acceptq_is_full(sk))
1551                 goto exit_overflow;
1552
1553         if (dst == NULL &&
1554             (dst = tcp_v4_route_req(sk, req)) == NULL)
1555                 goto exit;
1556
1557         newsk = tcp_create_openreq_child(sk, req, skb);
1558         if (!newsk)
1559                 goto exit;
1560
1561         newsk->dst_cache = dst;
1562         newsk->route_caps = dst->dev->features;
1563
1564         newtp = &(newsk->tp_pinfo.af_tcp);
1565         newsk->daddr = req->af.v4_req.rmt_addr;
1566         newsk->saddr = req->af.v4_req.loc_addr;
1567         newsk->rcv_saddr = req->af.v4_req.loc_addr;
1568         newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1569         req->af.v4_req.opt = NULL;
1570         newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1571         newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1572         newtp->ext_header_len = 0;
1573         if (newsk->protinfo.af_inet.opt)
1574                 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1575         newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1576
1577         tcp_sync_mss(newsk, dst->pmtu);
1578         newtp->advmss = dst->advmss;
1579         tcp_initialize_rcv_mss(newsk);
1580
1581         __tcp_v4_hash(newsk, 0);
1582         __tcp_inherit_port(sk, newsk);
1583
1584         return newsk;
1585
1586 exit_overflow:
1587         NET_INC_STATS_BH(ListenOverflows);
1588 exit:
1589         NET_INC_STATS_BH(ListenDrops);
1590         dst_release(dst);
1591         return NULL;
1592 }
1593
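/* Match a segment arriving on a LISTEN socket: first against the pending
 * open requests (half-open connections), then against an already
 * established child in the main hash; as a last resort, with syncookies
 * enabled, try to validate a bare ACK as a returning cookie.
 */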
1594 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1595 {
1596         struct open_request *req, **prev;
1597         struct tcphdr *th = skb->h.th;
1598         struct iphdr *iph = skb->nh.iph;
1599         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1600         struct sock *nsk;
1601
1602         /* Find possible connection requests. */
1603         req = tcp_v4_search_req(tp, &prev,
1604                                 th->source,
1605                                 iph->saddr, iph->daddr);
1606         if (req)
1607                 return tcp_check_req(sk, skb, req, prev);
1608
1609         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1610                                           th->source,
1611                                           skb->nh.iph->daddr,
1612                                           ntohs(th->dest),
1613                                           tcp_v4_iif(skb));
1614
1615         if (nsk) {
1616                 if (nsk->state != TCP_TIME_WAIT) {
1617                         bh_lock_sock(nsk);
1618                         return nsk;
1619                 }
1620                 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1621                 return NULL;
1622         }
1623
1624 #ifdef CONFIG_SYN_COOKIES
1625         if (!th->rst && !th->syn && th->ack)
1626                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1627 #endif
1628         return sk;
1629 }
1630
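/* Initialize the receive checksum state: trust a verified hardware
 * checksum, fully verify short segments (<= 76 bytes) right away, and
 * for longer ones only compute the pseudo-header part here, leaving
 * completion to tcp_checksum_complete() once the data is really needed.
 */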
1631 static int tcp_v4_checksum_init(struct sk_buff *skb)
1632 {
1633         if (skb->ip_summed == CHECKSUM_HW) {
1634                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1635                 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1636                                   skb->nh.iph->daddr,skb->csum))
1637                         return 0;
1638
1639                 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1640                 skb->ip_summed = CHECKSUM_NONE;
1641         }
1642         if (skb->len <= 76) {
1643                 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1644                                  skb->nh.iph->daddr,
1645                                  skb_checksum(skb, 0, skb->len, 0)))
1646                         return -1;
1647                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1648         } else {
1649                 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1650                                           skb->nh.iph->daddr,0);
1651         }
1652         return 0;
1653 }
1654
1655
1656 /* The socket must have its spinlock held when we get
1657  * here.
1658  *
1659  * We have a potential double-lock case here, so even when
1660  * doing backlog processing we use the BH locking scheme.
1661  * This is because we cannot sleep with the original spinlock
1662  * held.
1663  */
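/* Roughly: ESTABLISHED sockets take the fast path through
 * tcp_rcv_established(); LISTEN sockets first try to turn the segment
 * into a new child socket via tcp_v4_hnd_req(); everything else goes
 * through the generic tcp_rcv_state_process() state machine.
 */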
1664 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1665 {
1666 #ifdef CONFIG_FILTER
1667         struct sk_filter *filter = sk->filter;
1668         if (filter && sk_filter(skb, filter))
1669                 goto discard;
1670 #endif /* CONFIG_FILTER */
1671
1672         IP_INC_STATS_BH(IpInDelivers);
1673
1674         if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1675                 TCP_CHECK_TIMER(sk);
1676                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1677                         goto reset;
1678                 TCP_CHECK_TIMER(sk);
1679                 return 0; 
1680         }
1681
1682         if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1683                 goto csum_err;
1684
1685         if (sk->state == TCP_LISTEN) { 
1686                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1687                 if (!nsk)
1688                         goto discard;
1689
1690                 if (nsk != sk) {
1691                         if (tcp_child_process(sk, nsk, skb))
1692                                 goto reset;
1693                         return 0;
1694                 }
1695         }
1696
1697         TCP_CHECK_TIMER(sk);
1698         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1699                 goto reset;
1700         TCP_CHECK_TIMER(sk);
1701         return 0;
1702
1703 reset:
1704         tcp_v4_send_reset(skb);
1705 discard:
1706         kfree_skb(skb);
1707         /* Be careful here. If this function gets more complicated and
1708          * gcc suffers from register pressure on the x86, sk (in %ebx) 
1709          * might be destroyed here. This current version compiles correctly,
1710          * but you have been warned.
1711          */
1712         return 0;
1713
1714 csum_err:
1715         TCP_INC_STATS_BH(TcpInErrs);
1716         goto discard;
1717 }
1718
1719 /*
1720  *      From tcp_input.c
1721  */
1722
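/* Main receive entry point for IPv4 TCP segments: sanity-check the
 * header, set up TCP_SKB_CB(), look up the owning socket and either
 * process the segment directly (prequeue/fast path), append it to the
 * socket backlog if the socket is locked by a user, or handle the
 * no-socket and TIME-WAIT cases.
 */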
1723 int tcp_v4_rcv(struct sk_buff *skb)
1724 {
1725         struct tcphdr *th;
1726         struct sock *sk;
1727         int ret;
1728
1729         if (skb->pkt_type!=PACKET_HOST)
1730                 goto discard_it;
1731
1732         /* Count it even if it's bad */
1733         TCP_INC_STATS_BH(TcpInSegs);
1734
1735         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1736                 goto discard_it;
1737
1738         th = skb->h.th;
1739
1740         if (th->doff < sizeof(struct tcphdr)/4)
1741                 goto bad_packet;
1742         if (!pskb_may_pull(skb, th->doff*4))
1743                 goto discard_it;
1744
1745         /* An explanation is required here, I think.
1746          * Packet length and doff are validated by header prediction,
1747          * provided the case of th->doff==0 is eliminated.
1748          * So we defer the checks. */
1749         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1750              tcp_v4_checksum_init(skb) < 0))
1751                 goto bad_packet;
1752
1753         th = skb->h.th;
1754         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1755         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1756                                     skb->len - th->doff*4);
1757         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1758         TCP_SKB_CB(skb)->when = 0;
1759         TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1760         TCP_SKB_CB(skb)->sacked = 0;
1761
1762         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1763                              skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1764
1765         if (!sk)
1766                 goto no_tcp_socket;
1767
1768 process:
1769         if(!ipsec_sk_policy(sk,skb))
1770                 goto discard_and_relse;
1771
1772         if (sk->state == TCP_TIME_WAIT)
1773                 goto do_time_wait;
1774
1775         skb->dev = NULL;
1776
1777         bh_lock_sock(sk);
1778         ret = 0;
1779         if (!sk->lock.users) {
1780                 if (!tcp_prequeue(sk, skb))
1781                         ret = tcp_v4_do_rcv(sk, skb);
1782         } else
1783                 sk_add_backlog(sk, skb);
1784         bh_unlock_sock(sk);
1785
1786         sock_put(sk);
1787
1788         return ret;
1789
1790 no_tcp_socket:
1791         if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1792 bad_packet:
1793                 TCP_INC_STATS_BH(TcpInErrs);
1794         } else {
1795                 tcp_v4_send_reset(skb);
1796         }
1797
1798 discard_it:
1799         /* Discard frame. */
1800         kfree_skb(skb);
1801         return 0;
1802
1803 discard_and_relse:
1804         sock_put(sk);
1805         goto discard_it;
1806
1807 do_time_wait:
1808         if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1809                 TCP_INC_STATS_BH(TcpInErrs);
1810                 goto discard_and_relse;
1811         }
1812         switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1813                                           skb, th, skb->len)) {
1814         case TCP_TW_SYN:
1815         {
1816                 struct sock *sk2;
1817
1818                 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1819                 if (sk2 != NULL) {
1820                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1821                         tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1822                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1823                         sk = sk2;
1824                         goto process;
1825                 }
1826                 /* Fall through to ACK */
1827         }
1828         case TCP_TW_ACK:
1829                 tcp_v4_timewait_ack(sk, skb);
1830                 break;
1831         case TCP_TW_RST:
1832                 goto no_tcp_socket;
1833         case TCP_TW_SUCCESS:;
1834         }
1835         goto discard_it;
1836 }
1837
1838 /* With per-bucket locks this operation is not atomic, so
1839  * this open-coded version is no worse.
1840  */
1841 static void __tcp_v4_rehash(struct sock *sk)
1842 {
1843         sk->prot->unhash(sk);
1844         sk->prot->hash(sk);
1845 }
1846
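/* Pick a new source address after a reroute: query a fresh route to the
 * destination, and if the preferred source has changed (the ip_dynaddr
 * case), rewrite saddr/rcv_saddr and rehash the socket.
 */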
1847 static int tcp_v4_reselect_saddr(struct sock *sk)
1848 {
1849         int err;
1850         struct rtable *rt;
1851         __u32 old_saddr = sk->saddr;
1852         __u32 new_saddr;
1853         __u32 daddr = sk->daddr;
1854
1855         if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1856                 daddr = sk->protinfo.af_inet.opt->faddr;
1857
1858         /* Query new route. */
1859         err = ip_route_connect(&rt, daddr, 0,
1860                                RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1861                                sk->bound_dev_if);
1862         if (err)
1863                 return err;
1864
1865         __sk_dst_set(sk, &rt->u.dst);
1866         sk->route_caps = rt->u.dst.dev->features;
1867
1868         new_saddr = rt->rt_src;
1869
1870         if (new_saddr == old_saddr)
1871                 return 0;
1872
1873         if (sysctl_ip_dynaddr > 1) {
1874                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1875                        "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1876                        NIPQUAD(old_saddr), 
1877                        NIPQUAD(new_saddr));
1878         }
1879
1880         sk->saddr = new_saddr;
1881         sk->rcv_saddr = new_saddr;
1882
1883         /* XXX The only ugly spot where we really need to
1884          * XXX change the socket's identity after it has
1885          * XXX entered the hashes. -DaveM
1886          *
1887          * Besides that, it does not check for connection
1888          * uniqueness. Expect trouble.
1889          */
1890         __tcp_v4_rehash(sk);
1891         return 0;
1892 }
1893
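/* Called when transmission finds the cached route gone: re-route the
 * socket and, if that fails while ip_dynaddr allows it for a socket
 * still in SYN_SENT with no locally bound address, fall back to
 * tcp_v4_reselect_saddr() above.
 */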
1894 int tcp_v4_rebuild_header(struct sock *sk)
1895 {
1896         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1897         u32 daddr;
1898         int err;
1899
1900         /* Route is OK, nothing to do. */
1901         if (rt != NULL)
1902                 return 0;
1903
1904         /* Reroute. */
1905         daddr = sk->daddr;
1906         if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1907                 daddr = sk->protinfo.af_inet.opt->faddr;
1908
1909         err = ip_route_output(&rt, daddr, sk->saddr,
1910                               RT_CONN_FLAGS(sk), sk->bound_dev_if);
1911         if (!err) {
1912                 __sk_dst_set(sk, &rt->u.dst);
1913                 sk->route_caps = rt->u.dst.dev->features;
1914                 return 0;
1915         }
1916
1917         /* Routing failed... */
1918         sk->route_caps = 0;
1919
1920         if (!sysctl_ip_dynaddr ||
1921             sk->state != TCP_SYN_SENT ||
1922             (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1923             (err = tcp_v4_reselect_saddr(sk)) != 0)
1924                 sk->err_soft=-err;
1925
1926         return err;
1927 }
1928
1929 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1930 {
1931         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1932
1933         sin->sin_family         = AF_INET;
1934         sin->sin_addr.s_addr    = sk->daddr;
1935         sin->sin_port           = sk->dport;
1936 }
1937
1938 /* VJ's idea. Save the last timestamp seen from this destination
1939  * and hold it for at least the normal timewait interval, to use for
1940  * duplicate segment detection in subsequent connections before they
1941  * enter the synchronized state.
1942  */
1943
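/* The timestamp goes into the inet_peer entry for the destination,
 * reached through the cached route's bound peer when possible, or via a
 * direct inet_getpeer() lookup otherwise.
 */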
1944 int tcp_v4_remember_stamp(struct sock *sk)
1945 {
1946         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1947         struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1948         struct inet_peer *peer = NULL;
1949         int release_it = 0;
1950
1951         if (rt == NULL || rt->rt_dst != sk->daddr) {
1952                 peer = inet_getpeer(sk->daddr, 1);
1953                 release_it = 1;
1954         } else {
1955                 if (rt->peer == NULL)
1956                         rt_bind_peer(rt, 1);
1957                 peer = rt->peer;
1958         }
1959
1960         if (peer) {
1961                 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1962                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1963                      peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1964                         peer->tcp_ts_stamp = tp->ts_recent_stamp;
1965                         peer->tcp_ts = tp->ts_recent;
1966                 }
1967                 if (release_it)
1968                         inet_putpeer(peer);
1969                 return 1;
1970         }
1971
1972         return 0;
1973 }
1974
1975 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1976 {
1977         struct inet_peer *peer = NULL;
1978
1979         peer = inet_getpeer(tw->daddr, 1);
1980
1981         if (peer) {
1982                 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1983                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1984                      peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1985                         peer->tcp_ts_stamp = tw->ts_recent_stamp;
1986                         peer->tcp_ts = tw->ts_recent;
1987                 }
1988                 inet_putpeer(peer);
1989                 return 1;
1990         }
1991
1992         return 0;
1993 }
1994
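/* Address-family specific operations used by the generic TCP code via
 * tp->af_specific; the entries below form a positional initializer and
 * follow the field order of struct tcp_func (transmit, checksumming,
 * header rebuild, connection request handling, child socket creation,
 * timestamp caching, network header length, socket options and address
 * conversion).
 */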
1995 struct tcp_func ipv4_specific = {
1996         ip_queue_xmit,
1997         tcp_v4_send_check,
1998         tcp_v4_rebuild_header,
1999         tcp_v4_conn_request,
2000         tcp_v4_syn_recv_sock,
2001         tcp_v4_remember_stamp,
2002         sizeof(struct iphdr),
2003
2004         ip_setsockopt,
2005         ip_getsockopt,
2006         v4_addr2sockaddr,
2007         sizeof(struct sockaddr_in)
2008 };
2009
2010 /* NOTE: A lot of things are set to zero explicitly by the call to
2011  *       sk_alloc(), so they need not be done here.
2012  */
2013 static int tcp_v4_init_sock(struct sock *sk)
2014 {
2015         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2016
2017         skb_queue_head_init(&tp->out_of_order_queue);
2018         tcp_init_xmit_timers(sk);
2019         tcp_prequeue_init(tp);
2020
2021         tp->rto  = TCP_TIMEOUT_INIT;
2022         tp->mdev = TCP_TIMEOUT_INIT;
2023       
2024         /* So many TCP implementations out there (incorrectly) count the
2025          * initial SYN frame in their delayed-ACK and congestion control
2026          * algorithms that we must have the following bandaid to talk
2027          * efficiently to them.  -DaveM
2028          */
2029         tp->snd_cwnd = 2;
2030
2031         /* See draft-stevens-tcpca-spec-01 for discussion of the
2032          * initialization of these values.
2033          */
2034         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2035         tp->snd_cwnd_clamp = ~0;
2036         tp->mss_cache = 536;
2037
2038         tp->reordering = sysctl_tcp_reordering;
2039
2040         sk->state = TCP_CLOSE;
2041
2042         sk->write_space = tcp_write_space;
2043         sk->use_write_queue = 1;
2044
2045         sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2046
2047         sk->sndbuf = sysctl_tcp_wmem[1];
2048         sk->rcvbuf = sysctl_tcp_rmem[1];
2049
2050         atomic_inc(&tcp_sockets_allocated);
2051
2052         return 0;
2053 }
2054
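/* Tear down per-socket TCP state on final release: stop the transmit
 * timers, purge the write, out-of-order and prequeue queues, drop any
 * bind bucket reference and free the cached sendmsg page.
 */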
2055 static int tcp_v4_destroy_sock(struct sock *sk)
2056 {
2057         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2058
2059         tcp_clear_xmit_timers(sk);
2060
2061         /* Clean up the write buffer. */
2062         tcp_writequeue_purge(sk);
2063
2064         /* Clean up our, hopefully empty, out_of_order_queue. */
2065         __skb_queue_purge(&tp->out_of_order_queue);
2066
2067         /* Clean the prequeue; it really should be empty by now. */
2068         __skb_queue_purge(&tp->ucopy.prequeue);
2069
2070         /* Clean up a referenced TCP bind bucket. */
2071         if(sk->prev != NULL)
2072                 tcp_put_port(sk);
2073
2074         /* If sendmsg cached page exists, toss it. */
2075         if (tp->sndmsg_page != NULL)
2076                 __free_page(tp->sndmsg_page);
2077
2078         atomic_dec(&tcp_sockets_allocated);
2079
2080         return 0;
2081 }
2082
2083 /* Proc filesystem TCP sock list dumping. */
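/* The output mirrors the header written by tcp_get_info() below
 * ("sl  local_address rem_address   st ...").  A purely illustrative,
 * made-up line for a listener on port 80 (0x0050) might look like:
 *
 *    0: 00000000:0050 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234 1 c0100000
 *
 * with addresses/ports in hex, then state, tx/rx queue sizes, timer
 * info, retransmits, uid, timeout, inode, refcount and socket pointer
 * (get_tcp_sock() appends a few extra per-connection fields).
 */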
2084 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2085 {
2086         int ttd = req->expires - jiffies;
2087
2088         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2089                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2090                 i,
2091                 req->af.v4_req.loc_addr,
2092                 ntohs(sk->sport),
2093                 req->af.v4_req.rmt_addr,
2094                 ntohs(req->rmt_port),
2095                 TCP_SYN_RECV,
2096                 0,0, /* could print option size, but that is af dependent. */
2097                 1,   /* timers active (only the expire timer) */  
2098                 ttd, 
2099                 req->retrans,
2100                 uid,
2101                 0,  /* non standard timer */  
2102                 0, /* open_requests have no inode */
2103                 atomic_read(&sk->refcnt),
2104                 req
2105                 ); 
2106 }
2107
2108 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2109 {
2110         unsigned int dest, src;
2111         __u16 destp, srcp;
2112         int timer_active;
2113         unsigned long timer_expires;
2114         struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2115
2116         dest  = sp->daddr;
2117         src   = sp->rcv_saddr;
2118         destp = ntohs(sp->dport);
2119         srcp  = ntohs(sp->sport);
2120         if (tp->pending == TCP_TIME_RETRANS) {
2121                 timer_active    = 1;
2122                 timer_expires   = tp->timeout;
2123         } else if (tp->pending == TCP_TIME_PROBE0) {
2124                 timer_active    = 4;
2125                 timer_expires   = tp->timeout;
2126         } else if (timer_pending(&sp->timer)) {
2127                 timer_active    = 2;
2128                 timer_expires   = sp->timer.expires;
2129         } else {
2130                 timer_active    = 0;
2131                 timer_expires = jiffies;
2132         }
2133
2134         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2135                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2136                 i, src, srcp, dest, destp, sp->state, 
2137                 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2138                 timer_active, timer_expires-jiffies,
2139                 tp->retransmits,
2140                 sock_i_uid(sp),
2141                 tp->probes_out,
2142                 sock_i_ino(sp),
2143                 atomic_read(&sp->refcnt), sp,
2144                 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2145                 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2146                 );
2147 }
2148
2149 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2150 {
2151         unsigned int dest, src;
2152         __u16 destp, srcp;
2153         int ttd = tw->ttd - jiffies;
2154
2155         if (ttd < 0)
2156                 ttd = 0;
2157
2158         dest  = tw->daddr;
2159         src   = tw->rcv_saddr;
2160         destp = ntohs(tw->dport);
2161         srcp  = ntohs(tw->sport);
2162
2163         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2164                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2165                 i, src, srcp, dest, destp, tw->substate, 0, 0,
2166                 3, ttd, 0, 0, 0, 0,
2167                 atomic_read(&tw->refcnt), tw);
2168 }
2169
2170 #define TMPSZ 150
2171
2172 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2173 {
2174         int len = 0, num = 0, i;
2175         off_t begin, pos = 0;
2176         char tmpbuf[TMPSZ+1];
2177
2178         if (offset < TMPSZ)
2179                 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2180                                "  sl  local_address rem_address   st tx_queue "
2181                                "rx_queue tr tm->when retrnsmt   uid  timeout inode");
2182
2183         pos = TMPSZ;
2184
2185         /* First, walk listening socket table. */
2186         tcp_listen_lock();
2187         for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2188                 struct sock *sk;
2189                 struct tcp_listen_opt *lopt;
2190                 int k;
2191
2192                 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2193                         struct open_request *req;
2194                         int uid;
2195                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2196
2197                         if (!TCP_INET_FAMILY(sk->family))
2198                                 goto skip_listen;
2199
2200                         pos += TMPSZ;
2201                         if (pos >= offset) {
2202                                 get_tcp_sock(sk, tmpbuf, num);
2203                                 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2204                                 if (pos >= offset + length) {
2205                                         tcp_listen_unlock();
2206                                         goto out_no_bh;
2207                                 }
2208                         }
2209
2210 skip_listen:
2211                         uid = sock_i_uid(sk);
2212                         read_lock_bh(&tp->syn_wait_lock);
2213                         lopt = tp->listen_opt;
2214                         if (lopt && lopt->qlen != 0) {
2215                                 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2216                                         for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2217                                                 if (!TCP_INET_FAMILY(req->class->family))
2218                                                         continue;
2219
2220                                                 pos += TMPSZ;
2221                                                 if (pos <= offset)
2222                                                         continue;
2223                                                 get_openreq(sk, req, tmpbuf, num, uid);
2224                                                 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2225                                                 if (pos >= offset + length) {
2226                                                         read_unlock_bh(&tp->syn_wait_lock);
2227                                                         tcp_listen_unlock();
2228                                                         goto out_no_bh;
2229                                                 }
2230                                         }
2231                                 }
2232                         }
2233                         read_unlock_bh(&tp->syn_wait_lock);
2234
2235                         /* Completed requests are in normal socket hash table */
2236                 }
2237         }
2238         tcp_listen_unlock();
2239
2240         local_bh_disable();
2241
2242         /* Next, walk established hash chain. */
2243         for (i = 0; i < tcp_ehash_size; i++) {
2244                 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2245                 struct sock *sk;
2246                 struct tcp_tw_bucket *tw;
2247
2248                 read_lock(&head->lock);
2249                 for(sk = head->chain; sk; sk = sk->next, num++) {
2250                         if (!TCP_INET_FAMILY(sk->family))
2251                                 continue;
2252                         pos += TMPSZ;
2253                         if (pos <= offset)
2254                                 continue;
2255                         get_tcp_sock(sk, tmpbuf, num);
2256                         len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2257                         if (pos >= offset + length) {
2258                                 read_unlock(&head->lock);
2259                                 goto out;
2260                         }
2261                 }
2262                 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2263                      tw != NULL;
2264                      tw = (struct tcp_tw_bucket *)tw->next, num++) {
2265                         if (!TCP_INET_FAMILY(tw->family))
2266                                 continue;
2267                         pos += TMPSZ;
2268                         if (pos <= offset)
2269                                 continue;
2270                         get_timewait_sock(tw, tmpbuf, num);
2271                         len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2272                         if (pos >= offset + length) {
2273                                 read_unlock(&head->lock);
2274                                 goto out;
2275                         }
2276                 }
2277                 read_unlock(&head->lock);
2278         }
2279
2280 out:
2281         local_bh_enable();
2282 out_no_bh:
2283
2284         begin = len - (pos - offset);
2285         *start = buffer + begin;
2286         len -= begin;
2287         if (len > length)
2288                 len = length;
2289         if (len < 0)
2290                 len = 0; 
2291         return len;
2292 }
2293
2294 struct proto tcp_prot = {
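/* The proto operations TCP exports to the generic INET socket layer;
 * labelled-element initializers keep the hook names explicit.
 */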
2295         name:           "TCP",
2296         close:          tcp_close,
2297         connect:        tcp_v4_connect,
2298         disconnect:     tcp_disconnect,
2299         accept:         tcp_accept,
2300         ioctl:          tcp_ioctl,
2301         init:           tcp_v4_init_sock,
2302         destroy:        tcp_v4_destroy_sock,
2303         shutdown:       tcp_shutdown,
2304         setsockopt:     tcp_setsockopt,
2305         getsockopt:     tcp_getsockopt,
2306         sendmsg:        tcp_sendmsg,
2307         recvmsg:        tcp_recvmsg,
2308         backlog_rcv:    tcp_v4_do_rcv,
2309         hash:           tcp_v4_hash,
2310         unhash:         tcp_unhash,
2311         get_port:       tcp_v4_get_port,
2312 };
2313
2314
2315
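/* Set up the kernel-internal TCP control socket: create it through the
 * address family's create() hook, mark its allocations GFP_ATOMIC, give
 * it the maximum TTL, and unhash it so that incoming packets are never
 * delivered to it.
 */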
2316 void __init tcp_v4_init(struct net_proto_family *ops)
2317 {
2318         int err;
2319
2320         tcp_inode.i_mode = S_IFSOCK;
2321         tcp_inode.i_sock = 1;
2322         tcp_inode.i_uid = 0;
2323         tcp_inode.i_gid = 0;
2324         init_waitqueue_head(&tcp_inode.i_wait);
2325         init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2326
2327         tcp_socket->inode = &tcp_inode;
2328         tcp_socket->state = SS_UNCONNECTED;
2329         tcp_socket->type=SOCK_RAW;
2330
2331         if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2332                 panic("Failed to create the TCP control socket.\n");
2333         tcp_socket->sk->allocation=GFP_ATOMIC;
2334         tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2335
2336         /* Unhash it so that IP input processing does not even
2337          * see it; we do not want this socket to receive incoming
2338          * packets.
2339          */
2340         tcp_socket->sk->prot->unhash(tcp_socket->sk);
2341 }