2 * NET4: Implementation of BSD Unix domain sockets.
4 * Authors: Alan Cox, <alan.cox@linux.org>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
11 * Version: $Id: af_unix.c,v 1.126.2.5 2002/03/05 12:47:34 davem Exp $
14 * Linus Torvalds : Assorted bug cures.
15 * Niibe Yutaka : async I/O support.
16 * Carsten Paeth : PF_UNIX check, address fixes.
17 * Alan Cox : Limit size of allocated blocks.
18 * Alan Cox : Fixed the stupid socketpair bug.
19 * Alan Cox : BSD compatibility fine tuning.
20 * Alan Cox : Fixed a bug in connect when interrupted.
21 * Alan Cox : Sorted out a proper draft version of
22 * file descriptor passing hacked up from
24 * Marty Leisner : Fixes to fd passing
25 * Nick Nevin : recvmsg bugfix.
26 * Alan Cox : Started proper garbage collector
27 * Heiko EiBfeldt : Missing verify_area check
28 * Alan Cox : Started POSIXisms
29 * Andreas Schwab : Replace inode by dentry for proper
31 * Kirk Petersen : Made this a module
32 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
34 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
35 * by above two patches.
36 * Andrea Arcangeli : If possible we block in connect(2)
37 * if the max backlog of the listen socket
38 * has been reached. This won't break
39 * old apps and it will avoid huge amount
40 * of socks hashed (this for unix_gc()
41 * performances reasons).
42 * Security fix that limits the max
43 * number of socks to 2*max_files and
44 * the number of skb queueable in the
46 * Artur Skawina : Hash function optimizations
47 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
48 * Malcolm Beattie : Set peercred for socketpair
49 * Michal Ostrowski : Module initialization cleanup.
52 * Known differences from reference BSD that was tested:
55 * ECONNREFUSED is not returned from one end of a connected() socket to the
56 * other the moment one end closes.
57 * fstat() doesn't return st_dev=NODEV, and give the blksize as high water mark
58 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
60 * accept() returns a path name even if the connecting socket has closed
61 * in the meantime (BSD loses the path and gives up).
62 * accept() returns 0 length path for an unbound connector. BSD returns 16
63 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
64 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
65 * BSD af_unix apparently has connect forgetting to block properly.
66 * (need to check this with the POSIX spec in detail)
68 * Differences from 2.0.0-11-... (ANK)
69 * Bug fixes and improvements.
70 * - client shutdown killed server socket.
71 * - removed all useless cli/sti pairs.
73 * Semantic changes/extensions.
74 * - generic control message passing.
75 * - SCM_CREDENTIALS control message.
76 * - "Abstract" (not FS based) socket bindings.
77 * Abstract names are sequences of bytes (not zero terminated)
78 * started by 0, so that this name space does not intersect
82 #include <linux/module.h>
83 #include <linux/config.h>
84 #include <linux/kernel.h>
85 #include <linux/major.h>
86 #include <linux/signal.h>
87 #include <linux/sched.h>
88 #include <linux/errno.h>
89 #include <linux/string.h>
90 #include <linux/stat.h>
91 #include <linux/socket.h>
93 #include <linux/fcntl.h>
94 #include <linux/termios.h>
95 #include <linux/sockios.h>
96 #include <linux/net.h>
99 #include <linux/slab.h>
100 #include <asm/uaccess.h>
101 #include <linux/skbuff.h>
102 #include <linux/netdevice.h>
103 #include <net/sock.h>
104 #include <linux/tcp.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
108 #include <linux/init.h>
109 #include <linux/poll.h>
110 #include <linux/smp_lock.h>
111 #include <linux/rtnetlink.h>
113 #include <asm/checksum.h>
/*
 * File-scope state. unix_socket_table has UNIX_HASH_SIZE+1 chains; the
 * extra last chain is the one named unix_sockets_unbound below, and
 * unix_table_lock is the rwlock taken around table updates and lookups.
 */
115 int sysctl_unix_max_dgram_qlen = 10;
117 unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
118 rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
/* Number of unix socks: raised in unix_create1(), dropped in unix_sock_destructor(). */
119 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
121 #define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE])
/* Nonzero when the hash of the bound address is anything other than UNIX_HASH_SIZE. */
123 #define UNIX_ABSTRACT(sk) ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE)
126 * SMP locking strategy:
127 * hash table is protected with rwlock unix_table_lock
128 * each socket state is protected by separate rwlock.
/*
 * Reduce a checksum value by keeping only the bits selected by
 * UNIX_HASH_SIZE-1.
 * NOTE(review): only the final masking step is visible in this copy.
 */
131 static inline unsigned unix_hash_fold(unsigned hash)
135 return hash&(UNIX_HASH_SIZE-1);
/* The peer of a socket is kept in sk->pair; the macro is also used as an lvalue. */
138 #define unix_peer(sk) ((sk)->pair)
/* Nonzero when the peer recorded on osk is sk. */
140 static inline int unix_our_peer(unix_socket *sk, unix_socket *osk)
142 return unix_peer(osk) == sk;
/* sk may send to osk when osk has no peer at all, or when its peer is sk. */
145 static inline int unix_may_send(unix_socket *sk, unix_socket *osk)
147 return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
/*
 * Fetch the peer of s. Only the release of the state read lock is
 * visible in this copy of the function.
 * NOTE(review): presumably the peer is returned with a reference held - confirm.
 */
150 static inline unix_socket * unix_peer_get(unix_socket *s)
158 unix_state_runlock(s);
/*
 * Drop one reference on addr. The guarded statement runs only when the
 * count reaches zero.
 * NOTE(review): that statement is not visible here; presumably it frees addr.
 */
162 extern inline void unix_release_addr(struct unix_address *addr)
164 if (atomic_dec_and_test(&addr->refcnt))
169 * Check unix socket name:
170 * - should be not zero length.
171 * - if started by not zero, should be NULL terminated (FS object)
172 * - if started by zero, it is abstract name.
/*
 * Validate a sockaddr_un supplied by a caller and compute its hash.
 *
 * sunaddr: address to check; for a pathname it is modified in place.
 * len:     length supplied with the address.
 * hashp:   receives unix_hash_fold() of the checksum over the address bytes.
 *
 * NOTE(review): the return statements are not visible in this copy.
 */
175 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
/* Length must cover more than the family field and fit in a sockaddr_un. */
177 if (len <= sizeof(short) || len > sizeof(*sunaddr))
179 if (!sunaddr || sunaddr->sun_family != AF_UNIX)
/*
 * Pathname: terminate it at offset len and recompute the length.
 * NOTE(review): with len == sizeof(*sunaddr) this store lands one byte
 * past the structure; presumably callers pass a larger buffer - confirm.
 */
181 if (sunaddr->sun_path[0]) {
182 ((char *)sunaddr)[len]=0;
183 len = strlen(sunaddr->sun_path)+1+sizeof(short);
187 *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
/*
 * Unlink sk from the table chain recorded in sk->protinfo.af_unix.list
 * and clear that pointer. No locking is done here; the wrapper
 * unix_remove_socket() takes unix_table_lock for writing around the call.
 * NOTE(review): the conditions guarding the pointer updates are not visible.
 */
191 static void __unix_remove_socket(unix_socket *sk)
193 unix_socket **list = sk->protinfo.af_unix.list;
196 sk->next->prev = sk->prev;
198 sk->prev->next = sk->next;
201 sk->protinfo.af_unix.list = NULL;
/*
 * Put sk on the given table chain and remember the chain in
 * sk->protinfo.af_unix.list. Traps when sk is still recorded on a chain.
 * No locking is done here; see unix_insert_socket().
 * NOTE(review): the linking statements are not visible in this copy.
 */
208 static void __unix_insert_socket(unix_socket **list, unix_socket *sk)
210 BUG_TRAP(sk->protinfo.af_unix.list==NULL);
212 sk->protinfo.af_unix.list = list;
/* Remove sk from its table chain while holding unix_table_lock for writing. */
221 static inline void unix_remove_socket(unix_socket *sk)
223 write_lock(&unix_table_lock);
224 __unix_remove_socket(sk);
225 write_unlock(&unix_table_lock);
/* Insert sk on the given table chain while holding unix_table_lock for writing. */
228 static inline void unix_insert_socket(unix_socket **list, unix_socket *sk)
230 write_lock(&unix_table_lock);
231 __unix_insert_socket(list, sk);
232 write_unlock(&unix_table_lock);
/*
 * Walk the chain at unix_socket_table[hash^type] looking for a socket
 * whose bound address has the same length and the same bytes as sunname.
 * No locking is done here: unix_find_socket_byname() calls this under the
 * table read lock, unix_autobind() and unix_bind() under the write lock.
 */
235 static unix_socket *__unix_find_socket_byname(struct sockaddr_un *sunname,
236 int len, int type, unsigned hash)
240 for (s=unix_socket_table[hash^type]; s; s=s->next) {
241 if(s->protinfo.af_unix.addr->len==len &&
242 memcmp(s->protinfo.af_unix.addr->name, sunname, len) == 0)
/*
 * Name lookup performed under the read side of unix_table_lock.
 * NOTE(review): what happens to the result between the lookup and the
 * unlock is not visible here (presumably a reference is taken).
 */
248 static inline unix_socket *
249 unix_find_socket_byname(struct sockaddr_un *sunname,
250 int len, int type, unsigned hash)
254 read_lock(&unix_table_lock);
255 s = __unix_find_socket_byname(sunname, len, type, hash);
258 read_unlock(&unix_table_lock);
/*
 * Find the socket bound to inode i. The chain is selected by the low
 * bits of the inode number and searched under the table read lock for a
 * socket whose stored dentry points at i.
 */
262 static unix_socket *unix_find_socket_byinode(struct inode *i)
266 read_lock(&unix_table_lock);
267 for (s=unix_socket_table[i->i_ino & (UNIX_HASH_SIZE-1)]; s; s=s->next)
269 struct dentry *dentry = s->protinfo.af_unix.dentry;
271 if(dentry && dentry->d_inode == i)
277 read_unlock(&unix_table_lock);
/* Writable while four times the allocated write memory does not exceed sndbuf. */
281 static inline int unix_writable(struct sock *sk)
283 return ((atomic_read(&sk->wmem_alloc)<<2) <= sk->sndbuf);
/*
 * Write-space callback, installed as sk->write_space by unix_create1().
 * Under the read side of callback_lock: when the socket is writable,
 * wake tasks sleeping on sk->sleep (if any) and send the asynchronous
 * POLL_OUT notification.
 */
286 static void unix_write_space(struct sock *sk)
288 read_lock(&sk->callback_lock);
289 if (unix_writable(sk)) {
290 if (sk->sleep && waitqueue_active(sk->sleep))
291 wake_up_interruptible(sk->sleep);
292 sk_wake_async(sk, 2, POLL_OUT);
294 read_unlock(&sk->callback_lock);
297 /* When dgram socket disconnects (or changes its peer), we clear its receive
298 * queue of packets arrived from previous peer. First, it allows to do
299 * flow control based only on wmem_alloc; second, sk connected to peer
300 * may receive messages only from that peer. */
301 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
/* Only a non-empty queue is purged; tasks sleeping on the peer_wait queue of sk are then woken. */
303 if (skb_queue_len(&sk->receive_queue)) {
304 skb_queue_purge(&sk->receive_queue);
305 wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait);
307 /* If one link of bidirectional dgram pipe is disconnected,
308 * we signal error. Messages are lost. Do not make this,
309 * when peer was not connected to us.
311 if (!other->dead && unix_peer(other) == sk) {
312 other->err = ECONNRESET;
313 other->error_report(other);
/*
 * Sock destructor, installed as sk->destruct by unix_create1().
 * Purges the receive queue, checks that no write memory is outstanding,
 * that the sock is on no table chain and is detached from its struct
 * socket, releases the bound address if there is one and decrements
 * unix_nr_socks.
 * NOTE(review): the condition guarding the printk below is not visible.
 */
318 static void unix_sock_destructor(struct sock *sk)
320 skb_queue_purge(&sk->receive_queue);
322 BUG_TRAP(atomic_read(&sk->wmem_alloc) == 0);
323 BUG_TRAP(sk->protinfo.af_unix.list==NULL);
324 BUG_TRAP(sk->socket==NULL);
326 printk("Attempt to release alive unix socket: %p\n", sk);
330 if (sk->protinfo.af_unix.addr)
331 unix_release_addr(sk->protinfo.af_unix.addr);
333 atomic_dec(&unix_nr_socks);
334 #ifdef UNIX_REFCNT_DEBUG
335 printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
/*
 * Tear down a unix sock.
 *
 * sk:      sock to release.
 * embrion: nonzero when sk is a connection that was still waiting on the
 *          queue of a listening sock (see the recursive call below).
 *
 * Visible steps, in order: take sk off the table; under the state write
 * lock mark it fully shut down, detach its dentry and mount, and set
 * TCP_CLOSE; wake every task on peer_wait; notify a stream peer and drop
 * the peer reference; drain the receive queue; run the garbage collector
 * when any descriptors are in flight.
 * NOTE(review): several statements (saving the old state, orphaning the
 * sock, releasing dentry and mount, the final put) are not visible here.
 */
340 static int unix_release_sock (unix_socket *sk, int embrion)
342 struct dentry *dentry;
343 struct vfsmount *mnt;
348 unix_remove_socket(sk);
351 unix_state_wlock(sk);
353 sk->shutdown = SHUTDOWN_MASK;
354 dentry = sk->protinfo.af_unix.dentry;
355 sk->protinfo.af_unix.dentry=NULL;
356 mnt = sk->protinfo.af_unix.mnt;
357 sk->protinfo.af_unix.mnt=NULL;
359 sk->state = TCP_CLOSE;
360 unix_state_wunlock(sk);
362 wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait);
364 skpair=unix_peer(sk);
367 if (sk->type==SOCK_STREAM) {
368 unix_state_wlock(skpair);
369 skpair->shutdown=SHUTDOWN_MASK; /* No more writes*/
/* Unread data on sk, or an embryo being torn down, is reported to the peer as a reset. */
370 if (!skb_queue_empty(&sk->receive_queue) || embrion)
371 skpair->err = ECONNRESET;
372 unix_state_wunlock(skpair);
373 skpair->state_change(skpair);
374 read_lock(&skpair->callback_lock);
375 sk_wake_async(skpair,1,POLL_HUP);
376 read_unlock(&skpair->callback_lock);
378 sock_put(skpair); /* It may now die */
379 unix_peer(sk) = NULL;
382 /* Try to flush out this socket. Throw out buffers at least */
384 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
/* On a listening sock each queued skb stands for a pending connection; release its sock too. */
386 if (state==TCP_LISTEN)
387 unix_release_sock(skb->sk, 1);
388 /* passed fds are erased in the kfree_skb hook */
399 /* ---- Socket is dead now and most probably destroyed ---- */
402 * Fixme: BSD difference: In BSD all sockets connected to use get
403 * ECONNRESET and we die on the spot. In Linux we behave
404 * like files and pipes do and wait for the last
407 * Can't we simply set sock->err?
409 * What the above comment does talk about? --ANK(980817)
412 if (atomic_read(&unix_tot_inflight))
413 unix_gc(); /* Garbage collect fds */
/*
 * listen() handler. Accepted only for SOCK_STREAM sockets that are bound
 * and whose state is TCP_CLOSE or TCP_LISTEN. Under the state write lock
 * it stores the new backlog, moves the sock to TCP_LISTEN and records the
 * credentials of the calling task.
 * NOTE(review): the error value set before each goto is not visible here.
 */
418 static int unix_listen(struct socket *sock, int backlog)
421 struct sock *sk = sock->sk;
424 if (sock->type!=SOCK_STREAM)
425 goto out; /* Only stream sockets accept */
427 if (!sk->protinfo.af_unix.addr)
428 goto out; /* No listens on an unbound socket */
429 unix_state_wlock(sk);
430 if (sk->state != TCP_CLOSE && sk->state != TCP_LISTEN)
/* A larger backlog may admit tasks currently sleeping on peer_wait, so wake them. */
432 if (backlog > sk->max_ack_backlog)
433 wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait);
434 sk->max_ack_backlog=backlog;
435 sk->state=TCP_LISTEN;
436 /* set credentials so connect can copy them */
437 sk->peercred.pid = current->pid;
438 sk->peercred.uid = current->euid;
439 sk->peercred.gid = current->egid;
443 unix_state_wunlock(sk);
/* Forward declarations: both tables are defined later in this file and used by unix_create(). */
448 extern struct proto_ops unix_stream_ops;
449 extern struct proto_ops unix_dgram_ops;
/*
 * Allocate and initialise a unix sock and put it on the unbound chain.
 *
 * sock: owning struct socket, or NULL (unix_stream_connect() passes NULL
 *       for the sock it creates on behalf of the listener); with NULL the
 *       inflight counter starts at -1 instead of 0.
 *
 * Creation is refused once unix_nr_socks reaches 2*files_stat.max_files.
 * NOTE(review): the failure returns are not visible; unix_create() treats
 * a false result as -ENOMEM.
 */
451 static struct sock * unix_create1(struct socket *sock)
455 if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
459 sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
465 atomic_inc(&unix_nr_socks);
467 sock_init_data(sock,sk);
469 sk->write_space = unix_write_space;
471 sk->max_ack_backlog = sysctl_unix_max_dgram_qlen;
472 sk->destruct = unix_sock_destructor;
473 sk->protinfo.af_unix.dentry=NULL;
474 sk->protinfo.af_unix.mnt=NULL;
475 sk->protinfo.af_unix.lock = RW_LOCK_UNLOCKED;
476 atomic_set(&sk->protinfo.af_unix.inflight, sock ? 0 : -1);
477 init_MUTEX(&sk->protinfo.af_unix.readsem);/* single task reading lock */
478 init_waitqueue_head(&sk->protinfo.af_unix.peer_wait);
479 sk->protinfo.af_unix.list=NULL;
480 unix_insert_socket(&unix_sockets_unbound, sk);
/*
 * socket() handler for the family.
 *
 * protocol must be 0 or PF_UNIX, otherwise -EPROTONOSUPPORT. The ops
 * table is chosen from sock->type; an unsupported type gives
 * -ESOCKTNOSUPPORT. Returns 0 on success or -ENOMEM when unix_create1()
 * fails.
 * NOTE(review): the case labels are not visible in this copy; going by
 * the comment below, the type rewritten to SOCK_DGRAM is SOCK_RAW.
 */
485 static int unix_create(struct socket *sock, int protocol)
487 if (protocol && protocol != PF_UNIX)
488 return -EPROTONOSUPPORT;
490 sock->state = SS_UNCONNECTED;
492 switch (sock->type) {
494 sock->ops = &unix_stream_ops;
497 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
501 sock->type=SOCK_DGRAM;
503 sock->ops = &unix_dgram_ops;
506 return -ESOCKTNOSUPPORT;
509 return unix_create1(sock) ? 0 : -ENOMEM;
/*
 * release handler: hands the sock to unix_release_sock() with embrion 0.
 * NOTE(review): statements between the declaration and the return are not visible.
 */
512 static int unix_release(struct socket *sock)
514 unix_socket *sk = sock->sk;
521 return unix_release_sock (sk, 0);
/*
 * Bind the socket to an automatically chosen name: the path starts with
 * a zero byte (left by the memset) followed by the static counter
 * ordernum printed as five hex digits. The whole operation runs under
 * the readsem of the sock, and the table is updated under the write side
 * of unix_table_lock.
 * NOTE(review): the jumps taken when the sock is already bound, when the
 * allocation fails, or when the generated name is in use are not visible
 * here; presumably a used name leads to a retry with the next counter.
 */
524 static int unix_autobind(struct socket *sock)
526 struct sock *sk = sock->sk;
527 static u32 ordernum = 1;
528 struct unix_address * addr;
531 down(&sk->protinfo.af_unix.readsem);
534 if (sk->protinfo.af_unix.addr)
538 addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
542 memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
543 addr->name->sun_family = AF_UNIX;
544 atomic_set(&addr->refcnt, 1);
/* Length counts the family field, the leading zero byte and the digits written by sprintf. */
547 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
548 addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
550 write_lock(&unix_table_lock);
/* The counter wraps within 20 bits. */
551 ordernum = (ordernum+1)&0xFFFFF;
553 if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
555 write_unlock(&unix_table_lock);
556 /* Sanity yield. It is unusual case, but yet... */
557 if (!(ordernum&0xFF))
/* Mix the socket type into the hash, matching the hash^type indexing used by the name lookup. */
561 addr->hash ^= sk->type;
563 __unix_remove_socket(sk);
564 sk->protinfo.af_unix.addr = addr;
565 __unix_insert_socket(&unix_socket_table[addr->hash], sk);
566 write_unlock(&unix_table_lock);
570 up(&sk->protinfo.af_unix.readsem);
/*
 * Resolve a destination address to a sock.
 *
 * A pathname (first path byte nonzero) is walked through the filesystem:
 * write permission on the inode is checked, the inode must be a socket,
 * the sock is found by inode, the access time is updated and the type of
 * the sock found is compared with the requested type. Any other name is
 * looked up in the table by name.
 *
 * error: output parameter for an error code.
 * NOTE(review): the error assignments and returns are not visible here.
 */
574 static unix_socket *unix_find_other(struct sockaddr_un *sunname, int len,
575 int type, unsigned hash, int *error)
581 if (sunname->sun_path[0]) {
582 if (path_init(sunname->sun_path,
583 LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
584 err = path_walk(sunname->sun_path, &nd);
587 err = permission(nd.dentry->d_inode,MAY_WRITE);
592 if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
594 u=unix_find_socket_byinode(nd.dentry->d_inode);
599 UPDATE_ATIME(nd.dentry->d_inode);
604 if (u->type != type) {
610 u=unix_find_socket_byname(sunname, len, type, hash);
612 struct dentry *dentry;
613 dentry = u->protinfo.af_unix.dentry;
615 UPDATE_ATIME(dentry->d_inode);
/*
 * bind() handler.
 *
 * An address consisting of the family field only (addr_len equal to
 * sizeof(short)) is handed to unix_autobind(). Otherwise the name is
 * checked by unix_mkname() and, under the readsem of the sock, a
 * unix_address is allocated and filled in. A pathname is created in the
 * filesystem with vfs_mknod() while the parent directory is locked, and
 * its hash is set to UNIX_HASH_SIZE. Finally, under the write side of
 * unix_table_lock, the sock is moved from its current chain to the chain
 * chosen for the new address.
 * NOTE(review): most error branches, labels and the code between the
 * numbered lines are not visible in this copy; treat the summary as a
 * guide to the visible statements only.
 */
629 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
631 struct sock *sk = sock->sk;
632 struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
633 struct dentry * dentry = NULL;
637 struct unix_address *addr;
641 if (sunaddr->sun_family != AF_UNIX)
644 if (addr_len==sizeof(short)) {
645 err = unix_autobind(sock);
649 err = unix_mkname(sunaddr, addr_len, &hash);
654 down(&sk->protinfo.af_unix.readsem);
657 if (sk->protinfo.af_unix.addr)
661 addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
665 memcpy(addr->name, sunaddr, addr_len);
666 addr->len = addr_len;
667 addr->hash = hash^sk->type;
668 atomic_set(&addr->refcnt, 1);
670 if (sunaddr->sun_path[0]) {
674 * Get the parent directory, calculate the hash for last
677 if (path_init(sunaddr->sun_path, LOOKUP_PARENT, &nd))
678 err = path_walk(sunaddr->sun_path, &nd);
680 goto out_mknod_parent;
682 * Yucky last component or no last component at all?
683 * (foo/., foo/.., /////)
686 if (nd.last_type != LAST_NORM)
689 * Lock the directory.
691 down(&nd.dentry->d_inode->i_sem);
693 * Do the final lookup.
695 dentry = lookup_hash(&nd.last, nd.dentry);
696 err = PTR_ERR(dentry);
698 goto out_mknod_unlock;
701 * Special case - lookup gave negative, but... we had foo/bar/
702 * From the vfs_mknod() POV we just have a negative dentry -
703 * all is fine. Let's be bastards - you had / on the end, you've
704 * been asking for (non-existent) directory. -ENOENT for you.
706 if (nd.last.name[nd.last.len] && !dentry->d_inode)
709 * All right, let's create it.
711 mode = S_IFSOCK | (sock->inode->i_mode & ~current->fs->umask);
712 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
715 up(&nd.dentry->d_inode->i_sem);
719 addr->hash = UNIX_HASH_SIZE;
722 write_lock(&unix_table_lock);
724 if (!sunaddr->sun_path[0]) {
726 if (__unix_find_socket_byname(sunaddr, addr_len,
728 unix_release_addr(addr);
732 list = &unix_socket_table[addr->hash];
734 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
735 sk->protinfo.af_unix.dentry = nd.dentry;
736 sk->protinfo.af_unix.mnt = nd.mnt;
740 __unix_remove_socket(sk);
741 sk->protinfo.af_unix.addr = addr;
742 __unix_insert_socket(list, sk);
745 write_unlock(&unix_table_lock);
747 up(&sk->protinfo.af_unix.readsem);
754 up(&nd.dentry->d_inode->i_sem);
760 unix_release_addr(addr);
/*
 * connect() handler for datagram sockets.
 *
 * With a family other than AF_UNSPEC the address is checked by
 * unix_mkname(), the socket is autobound when passcred is set and it has
 * no address yet, the destination is resolved by unix_find_other() and
 * unix_may_send() is checked under the state write lock. With AF_UNSPEC
 * only the state write lock is taken (the comment below refers to
 * breaking the connected state). If a previous peer exists and differs
 * from the new one, unix_dgram_disconnected() is called for it after the
 * lock has been dropped.
 * NOTE(review): the peer assignment, error paths and reference handling
 * are not visible in this copy.
 */
764 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
767 struct sock *sk = sock->sk;
768 struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
773 if (addr->sa_family != AF_UNSPEC) {
774 err = unix_mkname(sunaddr, alen, &hash);
779 if (sock->passcred && !sk->protinfo.af_unix.addr &&
780 (err = unix_autobind(sock)) != 0)
783 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
787 unix_state_wlock(sk);
790 if (!unix_may_send(sk, other))
794 * 1003.1g breaking connected state with AF_UNSPEC
797 unix_state_wlock(sk);
801 * If it was connected, reconnect.
804 struct sock *old_peer = unix_peer(sk);
806 unix_state_wunlock(sk);
808 if (other != old_peer)
809 unix_dgram_disconnected(sk, old_peer);
813 unix_state_wunlock(sk);
818 unix_state_wunlock(sk);
/*
 * Wait for room on the receive queue of other.
 *
 * The task is queued as an exclusive, interruptible waiter on the
 * peer_wait queue of other. The decision to sleep requires that other is
 * not dead, has not shut down receiving, and has more queued skbs than
 * max_ack_backlog. The state read lock on other, which callers in this
 * file take before calling, is released before scheduling.
 *
 * timeo: timeout passed to schedule_timeout(), which also supplies the
 *        updated value.
 * NOTE(review): the test of sched and the return statement are not visible.
 */
824 static long unix_wait_for_peer(unix_socket *other, long timeo)
827 DECLARE_WAITQUEUE(wait, current);
829 __set_current_state(TASK_INTERRUPTIBLE);
830 add_wait_queue_exclusive(&other->protinfo.af_unix.peer_wait, &wait);
832 sched = (!other->dead &&
833 !(other->shutdown&RCV_SHUTDOWN) &&
834 skb_queue_len(&other->receive_queue) > other->max_ack_backlog);
836 unix_state_runlock(other);
839 timeo = schedule_timeout(timeo);
841 __set_current_state(TASK_RUNNING);
842 remove_wait_queue(&other->protinfo.af_unix.peer_wait, &wait);
/*
 * connect() handler for stream sockets.
 *
 * Visible sequence: check the address with unix_mkname(); autobind when
 * passcred is set and the socket has no address; allocate a new sock
 * (newsk) and one skb up front; resolve the listener with
 * unix_find_other() and read-lock its state; require TCP_LISTEN; when
 * the listener queue exceeds max_ack_backlog, wait in
 * unix_wait_for_peer(); write-lock sk and recheck its state; fill in
 * newsk (state, type, credentials of the caller, address, dentry and
 * mount shared with the listener); copy the listener credentials to sk;
 * mark sk connected; queue the skb on the listener and signal it.
 * NOTE(review): error values, labels, retry jumps and the peer linkage
 * between sk and newsk are not visible in this copy.
 */
846 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
847 int addr_len, int flags)
849 struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
850 struct sock *sk = sock->sk;
851 struct sock *newsk = NULL;
852 unix_socket *other = NULL;
853 struct sk_buff *skb = NULL;
859 err = unix_mkname(sunaddr, addr_len, &hash);
864 if (sock->passcred && !sk->protinfo.af_unix.addr &&
865 (err = unix_autobind(sock)) != 0)
868 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
870 /* First of all allocate resources.
871 If we will make it after state is locked,
872 we will have to recheck all again in any case.
877 /* create new sock for complete connection */
878 newsk = unix_create1(NULL);
882 /* Allocate skb for sending to listening sock */
883 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
888 /* Find listening sock. */
889 other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
893 /* Latch state of peer */
894 unix_state_rlock(other);
896 /* Apparently VFS overslept socket death. Retry. */
898 unix_state_runlock(other);
904 if (other->state != TCP_LISTEN)
907 if (skb_queue_len(&other->receive_queue) > other->max_ack_backlog) {
912 timeo = unix_wait_for_peer(other, timeo);
914 err = sock_intr_errno(timeo);
915 if (signal_pending(current))
923 It is tricky place. We need to grab write lock and cannot
924 drop lock on peer. It is dangerous because deadlock is
925 possible. Connect to self case and simultaneous
926 attempt to connect are eliminated by checking socket
927 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
928 check this before attempt to grab lock.
930 Well, and we have to recheck the state after socket locked.
936 /* This is ok... continue with connect */
938 case TCP_ESTABLISHED:
939 /* Socket is already connected */
947 unix_state_wlock(sk);
/* The state of sk changed before its lock was obtained: release both locks. */
949 if (sk->state != st) {
950 unix_state_wunlock(sk);
951 unix_state_runlock(other);
956 /* The way is open! Fastly set all the necessary fields... */
960 newsk->state=TCP_ESTABLISHED;
961 newsk->type=SOCK_STREAM;
962 newsk->peercred.pid = current->pid;
963 newsk->peercred.uid = current->euid;
964 newsk->peercred.gid = current->egid;
965 newsk->sleep = &newsk->protinfo.af_unix.peer_wait;
967 /* copy address information from listening to new sock*/
968 if (other->protinfo.af_unix.addr)
970 atomic_inc(&other->protinfo.af_unix.addr->refcnt);
971 newsk->protinfo.af_unix.addr=other->protinfo.af_unix.addr;
973 if (other->protinfo.af_unix.dentry) {
974 newsk->protinfo.af_unix.dentry=dget(other->protinfo.af_unix.dentry);
975 newsk->protinfo.af_unix.mnt=mntget(other->protinfo.af_unix.mnt);
978 /* Set credentials */
979 sk->peercred = other->peercred;
983 sock->state=SS_CONNECTED;
984 sk->state=TCP_ESTABLISHED;
986 unix_state_wunlock(sk);
988 /* take ten and and send info to listening sock */
989 spin_lock(&other->receive_queue.lock);
990 __skb_queue_tail(&other->receive_queue,skb);
991 /* Undo artificially decreased inflight after embrion
992 * is installed to listening socket. */
993 atomic_inc(&newsk->protinfo.af_unix.inflight);
994 spin_unlock(&other->receive_queue.lock);
995 unix_state_runlock(other);
996 other->data_ready(other, 0);
1002 unix_state_runlock(other);
1008 unix_release_sock(newsk, 0);
/*
 * socketpair() handler. Both socks receive the pid, effective uid and
 * effective gid of the calling task as peer credentials. The state
 * updates below follow a test that the type is not SOCK_DGRAM.
 * NOTE(review): the statements that link the two socks as peers, and the
 * extent of the conditional, are not visible in this copy.
 */
1014 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1016 struct sock *ska=socka->sk, *skb = sockb->sk;
1018 /* Join our sockets back to back */
1023 ska->peercred.pid = skb->peercred.pid = current->pid;
1024 ska->peercred.uid = skb->peercred.uid = current->euid;
1025 ska->peercred.gid = skb->peercred.gid = current->egid;
1027 if (ska->type != SOCK_DGRAM)
1029 ska->state=TCP_ESTABLISHED;
1030 skb->state=TCP_ESTABLISHED;
1031 socka->state=SS_CONNECTED;
1032 sockb->state=SS_CONNECTED;
/*
 * accept() handler. Requires a SOCK_STREAM socket in TCP_LISTEN. One skb
 * is taken from the receive queue of the listener with
 * skb_recv_datagram() (non-blocking when O_NONBLOCK is set in flags),
 * the skb is freed, one waiter on peer_wait is woken, and the sock tsk is
 * grafted onto newsock under its state write lock.
 * NOTE(review): the assignment of tsk and the error returns are not
 * visible; unix_release_sock() treats skb->sk on a listener queue as the
 * pending sock, so tsk presumably comes from there.
 */
1037 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1039 unix_socket *sk = sock->sk;
1041 struct sk_buff *skb;
1045 if (sock->type!=SOCK_STREAM)
1049 if (sk->state!=TCP_LISTEN)
1052 /* If socket state is TCP_LISTEN it cannot change (for now...),
1053 * so that no locks are necessary.
1056 skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1058 /* This means receive shutdown. */
1065 skb_free_datagram(sk, skb);
1066 wake_up_interruptible(&sk->protinfo.af_unix.peer_wait);
1068 /* attach accepted sock to socket */
1069 unix_state_wlock(tsk);
1070 newsock->state = SS_CONNECTED;
1071 sock_graft(tsk, newsock);
1072 unix_state_wunlock(tsk);
/*
 * getsockname()/getpeername() handler.
 *
 * Under the state read lock: a sock without an address reports family
 * AF_UNIX, an empty path and a length of sizeof(short); otherwise the
 * stored address and its length are copied out.
 * NOTE(review): the test on the peer argument around unix_peer_get() and
 * the handling of a missing peer are not visible in this copy.
 */
1080 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1082 struct sock *sk = sock->sk;
1083 struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1087 sk = unix_peer_get(sk);
1097 unix_state_rlock(sk);
1098 if (!sk->protinfo.af_unix.addr) {
1099 sunaddr->sun_family = AF_UNIX;
1100 sunaddr->sun_path[0] = 0;
1101 *uaddr_len = sizeof(short);
1103 struct unix_address *addr = sk->protinfo.af_unix.addr;
1105 *uaddr_len = addr->len;
1106 memcpy(sunaddr, addr->name, *uaddr_len);
1108 unix_state_runlock(sk);
/*
 * Move the passed file list from the control block of skb into scm,
 * restore sock_wfree as the skb destructor, and call unix_notinflight()
 * for every file in the list, last to first.
 */
1114 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1118 scm->fp = UNIXCB(skb).fp;
1119 skb->destructor = sock_wfree;
1120 UNIXCB(skb).fp = NULL;
1122 for (i=scm->fp->count-1; i>=0; i--)
1123 unix_notinflight(scm->fp->fp[i]);
/*
 * skb destructor installed by unix_attach_fds(): detaches the file list
 * into a zeroed temporary cookie.
 * NOTE(review): the statements that dispose of the cookie afterwards are
 * not visible in this copy.
 */
1126 static void unix_destruct_fds(struct sk_buff *skb)
1128 struct scm_cookie scm;
1129 memset(&scm, 0, sizeof(scm));
1130 unix_detach_fds(&scm, skb);
1132 /* Alas, it calls VFS */
1133 /* So fscking what? fput() had been SMP-safe since the last Summer */
/*
 * Call unix_inflight() for every file in scm->fp, last to first, hand
 * the list over to the control block of skb and install
 * unix_destruct_fds() as the skb destructor.
 * NOTE(review): scm->fp is dereferenced unconditionally in the visible
 * lines; callers presumably check it first - confirm.
 */
1138 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1141 for (i=scm->fp->count-1; i>=0; i--)
1142 unix_inflight(scm->fp->fp[i]);
1143 UNIXCB(skb).fp = scm->fp;
1144 skb->destructor = unix_destruct_fds;
1149 * Send AF_UNIX data.
/*
 * sendmsg() handler for datagram sockets.
 *
 * Visible sequence: test MSG_OOB; with a destination name, check it via
 * unix_mkname(), otherwise use the connected peer from unix_peer_get();
 * autobind when passcred is set and the socket has no address; compare
 * the length with sndbuf - 32; allocate the skb, copy credentials,
 * attach descriptors and copy the payload from the iovec; resolve the
 * destination if needed; under the state read lock of the receiver check
 * unix_may_send(), receive shutdown and queue length (waiting in
 * unix_wait_for_peer() when the receiver is not connected back to us and
 * its queue is over max_ack_backlog); queue the skb and signal the
 * receiver.
 * NOTE(review): error codes, labels, the dead-receiver test guarding the
 * disconnect branch and the return value are not visible in this copy.
 */
1152 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
1153 struct scm_cookie *scm)
1155 struct sock *sk = sock->sk;
1156 struct sockaddr_un *sunaddr=msg->msg_name;
1157 unix_socket *other = NULL;
1158 int namelen = 0; /* fake GCC */
1161 struct sk_buff *skb;
1165 if (msg->msg_flags&MSG_OOB)
1168 if (msg->msg_namelen) {
1169 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1176 other = unix_peer_get(sk);
1181 if (sock->passcred && !sk->protinfo.af_unix.addr &&
1182 (err = unix_autobind(sock)) != 0)
1186 if ((unsigned)len > sk->sndbuf - 32)
1189 skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1193 memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
1195 unix_attach_fds(scm, skb);
1197 skb->h.raw = skb->data;
1198 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1202 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1207 if (sunaddr == NULL)
1210 other = unix_find_other(sunaddr, namelen, sk->type, hash, &err);
1215 unix_state_rlock(other);
1217 if (!unix_may_send(sk, other))
1222 * Check with 1003.1g - what should
1225 unix_state_runlock(other);
1229 unix_state_wlock(sk);
1230 if (unix_peer(sk) == other) {
1232 unix_state_wunlock(sk);
1234 unix_dgram_disconnected(sk, other);
1236 err = -ECONNREFUSED;
1238 unix_state_wunlock(sk);
1248 if (other->shutdown&RCV_SHUTDOWN)
1251 if (unix_peer(other) != sk &&
1252 skb_queue_len(&other->receive_queue) > other->max_ack_backlog) {
1258 timeo = unix_wait_for_peer(other, timeo);
1260 err = sock_intr_errno(timeo);
1261 if (signal_pending(current))
1267 skb_queue_tail(&other->receive_queue, skb);
1268 unix_state_runlock(other);
1269 other->data_ready(other, len);
1274 unix_state_runlock(other);
/*
 * sendmsg() handler for stream sockets.
 *
 * A destination name is not accepted: err becomes -EISCONN on an
 * established socket and -EOPNOTSUPP otherwise. Data goes to the peer
 * from unix_peer_get() in pieces: each piece is capped at sndbuf/2 - 64
 * and at SKB_MAX_ALLOC, then trimmed to the tailroom of the allocated
 * skb. Credentials and descriptors are attached, the payload is copied
 * from the iovec and the skb is queued on the peer under its state read
 * lock unless the peer is dead or has shut down receiving.
 * SIGPIPE is sent when nothing was sent and MSG_NOSIGNAL is not set.
 * Returns sent when it is nonzero, otherwise err.
 * NOTE(review): the loop construct, labels and error codes are not
 * visible in this copy.
 */
1284 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
1285 struct scm_cookie *scm)
1287 struct sock *sk = sock->sk;
1288 unix_socket *other = NULL;
1289 struct sockaddr_un *sunaddr=msg->msg_name;
1291 struct sk_buff *skb;
1295 if (msg->msg_flags&MSG_OOB)
1298 if (msg->msg_namelen) {
1299 err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP);
1304 other = unix_peer_get(sk);
1309 if (sk->shutdown&SEND_SHUTDOWN)
1315 * Optimisation for the fact that under 0.01% of X messages typically
1321 /* Keep two messages in the pipe so it schedules better */
1322 if (size > sk->sndbuf/2 - 64)
1323 size = sk->sndbuf/2 - 64;
1325 if (size > SKB_MAX_ALLOC)
1326 size = SKB_MAX_ALLOC;
1332 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1338 * If you pass two values to the sock_alloc_send_skb
1339 * it tries to grab the large buffer with GFP_NOFS
1340 * (which can fail easily), and if it fails grab the
1341 * fallback size buffer which is under a page and will
1344 size = min_t(int, size, skb_tailroom(skb));
1346 memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
1348 unix_attach_fds(scm, skb);
1350 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1355 unix_state_rlock(other);
1357 if (other->dead || (other->shutdown & RCV_SHUTDOWN))
1360 skb_queue_tail(&other->receive_queue, skb);
1361 unix_state_runlock(other);
1362 other->data_ready(other, size);
1369 unix_state_runlock(other);
1372 if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1373 send_sig(SIGPIPE,current,0);
1378 return sent ? : err;
/*
 * Report the bound address of sk in msg: msg_namelen is 0 when sk has
 * no address, otherwise the stored length, with the address bytes copied
 * to msg->msg_name.
 */
1381 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1383 msg->msg_namelen = 0;
1384 if (sk->protinfo.af_unix.addr) {
1385 msg->msg_namelen=sk->protinfo.af_unix.addr->len;
1386 memcpy(msg->msg_name,
1387 sk->protinfo.af_unix.addr->name,
1388 sk->protinfo.af_unix.addr->len);
/*
 * recvmsg() handler for datagram sockets.
 *
 * Under the readsem of the sock: take one skb with skb_recv_datagram(),
 * wake one waiter on peer_wait, report the address of the sending sock,
 * set MSG_TRUNC when the buffer is smaller than the datagram, copy the
 * payload and credentials, and pass descriptors on. Without MSG_PEEK the
 * descriptors are detached from the skb; the peek path duplicates the
 * list with scm_fp_dup() instead.
 * NOTE(review): error handling, the guards around the address copy and
 * descriptor handling, and the return value are not visible in this copy.
 */
1392 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size,
1393 int flags, struct scm_cookie *scm)
1395 struct sock *sk = sock->sk;
1396 int noblock = flags & MSG_DONTWAIT;
1397 struct sk_buff *skb;
1404 msg->msg_namelen = 0;
1406 down(&sk->protinfo.af_unix.readsem);
1408 skb = skb_recv_datagram(sk, flags, noblock, &err);
1412 wake_up_interruptible(&sk->protinfo.af_unix.peer_wait);
1415 unix_copy_addr(msg, skb->sk);
1417 if (size > skb->len)
1419 else if (size < skb->len)
1420 msg->msg_flags |= MSG_TRUNC;
1422 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1426 scm->creds = *UNIXCREDS(skb);
1428 if (!(flags & MSG_PEEK))
1431 unix_detach_fds(scm, skb);
1435 /* It is questionable: on PEEK we could:
1436 - do not return fds - good, but too simple 8)
1437 - return fds, and do not return them on read (old strategy,
1439 - clone fds (I choosed it for now, it is the most universal
1442 POSIX 1003.1g does not actually define this clearly
1443 at all. POSIX 1003.1g doesn't define a lot of things
1448 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1453 skb_free_datagram(sk,skb);
1455 up(&sk->protinfo.af_unix.readsem);
1461 * Sleep until data has arrive. But check for races..
/*
 * Sleep on sk->sleep until a wake-up condition holds. The conditions
 * visible here are: data on the receive queue, receive shutdown, or a
 * pending signal. The state read lock is held while the conditions are
 * checked and dropped around schedule_timeout(); SOCK_ASYNC_WAITDATA is
 * set on the socket for the duration of the sleep.
 *
 * timeo: timeout passed to schedule_timeout(), which supplies the
 *        updated value.
 * NOTE(review): the loop construct, the remaining conditions and the
 * return statement are not visible in this copy.
 */
1464 static long unix_stream_data_wait(unix_socket * sk, long timeo)
1466 DECLARE_WAITQUEUE(wait, current);
1468 unix_state_rlock(sk);
1470 add_wait_queue(sk->sleep, &wait);
1473 set_current_state(TASK_INTERRUPTIBLE);
1475 if (skb_queue_len(&sk->receive_queue) ||
1477 (sk->shutdown & RCV_SHUTDOWN) ||
1478 signal_pending(current) ||
1482 set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1483 unix_state_runlock(sk);
1484 timeo = schedule_timeout(timeo);
1485 unix_state_rlock(sk);
1486 clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1489 __set_current_state(TASK_RUNNING);
1490 remove_wait_queue(sk->sleep, &wait);
1491 unix_state_runlock(sk);
/*
 * recvmsg() handler for stream sockets.
 *
 * Requires TCP_ESTABLISHED. With the readsem of the sock held, skbs are
 * dequeued one at a time. When the queue is empty the code checks
 * whether enough has been copied (target from sock_rcvlowat()), then
 * socket error and receive shutdown, and otherwise releases readsem and
 * waits in unix_stream_data_wait(). Data from writers with different
 * credentials is not merged: such an skb is put back at the head of the
 * queue. Each skb contributes min(skb->len, size) bytes; without
 * MSG_PEEK the consumed part is pulled and descriptors are detached,
 * while the peek path duplicates the descriptor list and requeues the skb.
 * Returns copied when it is nonzero, otherwise err.
 * NOTE(review): the loop construct, several guards and error codes are
 * not visible in this copy.
 */
1497 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size,
1498 int flags, struct scm_cookie *scm)
1500 struct sock *sk = sock->sk;
1501 struct sockaddr_un *sunaddr=msg->msg_name;
1503 int check_creds = 0;
1509 if (sk->state != TCP_ESTABLISHED)
1516 target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1517 timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1519 msg->msg_namelen = 0;
1521 /* Lock the socket to prevent queue disordering
1522 * while sleeps in memcpy_tomsg
1525 down(&sk->protinfo.af_unix.readsem);
1530 struct sk_buff *skb;
1532 skb=skb_dequeue(&sk->receive_queue);
1535 if (copied >= target)
1539 * POSIX 1003.1g mandates this order.
1542 if ((err = sock_error(sk)) != 0)
1544 if (sk->shutdown & RCV_SHUTDOWN)
1549 up(&sk->protinfo.af_unix.readsem);
1551 timeo = unix_stream_data_wait(sk, timeo);
1553 if (signal_pending(current)) {
1554 err = sock_intr_errno(timeo);
1557 down(&sk->protinfo.af_unix.readsem);
1562 /* Never glue messages from different writers */
1563 if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) {
1564 skb_queue_head(&sk->receive_queue, skb);
1568 /* Copy credentials */
1569 scm->creds = *UNIXCREDS(skb);
1573 /* Copy address just once */
1576 unix_copy_addr(msg, skb->sk);
1580 chunk = min_t(unsigned int, skb->len, size);
1581 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1582 skb_queue_head(&sk->receive_queue, skb);
1590 /* Mark read part of skb as used */
1591 if (!(flags & MSG_PEEK))
1593 skb_pull(skb, chunk);
1596 unix_detach_fds(scm, skb);
1598 /* put the skb back if we didn't use it up.. */
1601 skb_queue_head(&sk->receive_queue, skb);
1612 /* It is questionable, see note in unix_dgram_recvmsg.
1615 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1617 /* put message back and return */
1618 skb_queue_head(&sk->receive_queue, skb);
1623 up(&sk->protinfo.af_unix.readsem);
1625 return copied ? : err;
/*
 * shutdown() handler.
 *
 * mode is converted with (mode+1) masked by RCV_SHUTDOWN|SEND_SHUTDOWN
 * and OR-ed into sk->shutdown under the state write lock. For a stream
 * socket with a peer the mirrored bits are applied to the peer (our
 * receive side closes its send side and vice versa), its state_change
 * callback runs, and an asynchronous POLL_HUP or POLL_IN notification is
 * sent under the callback_lock of the peer.
 * NOTE(review): the mode+1 conversion presumably relies on RCV_SHUTDOWN
 * and SEND_SHUTDOWN being the values 1 and 2 - confirm. Reference
 * handling for the peer and the return value are not visible here.
 */
1628 static int unix_shutdown(struct socket *sock, int mode)
1630 struct sock *sk = sock->sk;
1633 mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1636 unix_state_wlock(sk);
1637 sk->shutdown |= mode;
1638 other=unix_peer(sk);
1641 unix_state_wunlock(sk);
1642 sk->state_change(sk);
1644 if (other && sk->type == SOCK_STREAM) {
1647 if (mode&RCV_SHUTDOWN)
1648 peer_mode |= SEND_SHUTDOWN;
1649 if (mode&SEND_SHUTDOWN)
1650 peer_mode |= RCV_SHUTDOWN;
1651 unix_state_wlock(other);
1652 other->shutdown |= peer_mode;
1653 unix_state_wunlock(other);
1654 other->state_change(other);
1655 read_lock(&other->callback_lock);
1656 if (peer_mode == SHUTDOWN_MASK)
1657 sk_wake_async(other,1,POLL_HUP);
1658 else if (peer_mode & RCV_SHUTDOWN)
1659 sk_wake_async(other,1,POLL_IN);
1660 read_unlock(&other->callback_lock);
/*
 * ioctl() handler.
 *
 * Visible requests: one stores the allocated write memory (wmem_alloc)
 * into the user int at arg; another examines the receive queue under its
 * spinlock, walking every skb for a stream socket and peeking at the
 * first skb otherwise, and stores an amount into the user int at arg. A
 * sock in TCP_LISTEN is handled separately in that request. Remaining
 * requests are passed to dev_ioctl().
 * NOTE(review): the case labels and the amount computation are not
 * visible here; presumably these are the output-queue and input-queue
 * size requests - confirm.
 */
1668 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1670 struct sock *sk = sock->sk;
1677 amount = atomic_read(&sk->wmem_alloc);
1678 err = put_user(amount, (int *)arg);
1682 struct sk_buff *skb;
1683 if (sk->state==TCP_LISTEN) {
1688 spin_lock(&sk->receive_queue.lock);
1689 if (sk->type == SOCK_STREAM) {
1690 skb_queue_walk(&sk->receive_queue, skb)
1693 if((skb=skb_peek(&sk->receive_queue))!=NULL)
1696 spin_unlock(&sk->receive_queue.lock);
1697 err = put_user(amount, (int *)arg);
1702 err = dev_ioctl(cmd, (void *)arg);
/*
 * poll() handler. Registers on sk->sleep and builds the event mask:
 * readable when the receive queue is non-empty or receiving is shut
 * down; writable (POLLOUT, POLLWRNORM, POLLWRBAND) when unix_writable()
 * holds. Full shutdown and a stream socket in TCP_CLOSE are also tested.
 * NOTE(review): the mask bits set for those last two tests and the
 * return statement are not visible in this copy.
 */
1708 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1710 struct sock *sk = sock->sk;
1713 poll_wait(file, sk->sleep, wait);
1716 /* exceptional events? */
1719 if (sk->shutdown == SHUTDOWN_MASK)
1723 if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN))
1724 mask |= POLLIN | POLLRDNORM;
1726 /* Connection-based need to check for termination and startup */
1727 if (sk->type == SOCK_STREAM && sk->state==TCP_CLOSE)
1731 * we set writable also when the other side has shut down the
1732 * connection. This prevents stuck sockets.
1734 if (unix_writable(sk))
1735 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
/*
 * Read handler for the proc entry registered in af_unix_init().
 * Writes a header line, then one line per socket while holding the table
 * read lock, with each socket state read-locked while it is formatted.
 * The flags column is __SO_ACCEPTCON for a listening sock. When an
 * address is bound, its path bytes (length minus the family field) are
 * appended after a space. offset and length select the window of output
 * handed back through *start.
 * NOTE(review): the remaining sprintf arguments, the adjustment made
 * when UNIX_ABSTRACT() is false, and the position bookkeeping are not
 * visible in this copy.
 */
1741 #ifdef CONFIG_PROC_FS
1742 static int unix_read_proc(char *buffer, char **start, off_t offset,
1743 int length, int *eof, void *data)
1751 len+= sprintf(buffer,"Num RefCount Protocol Flags Type St "
1754 read_lock(&unix_table_lock);
1755 forall_unix_sockets (i,s)
1757 unix_state_rlock(s);
1759 len+=sprintf(buffer+len,"%p: %08X %08X %08X %04X %02X %5lu",
1761 atomic_read(&s->refcnt),
1763 s->state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1766 (s->state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1767 (s->state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1770 if (s->protinfo.af_unix.addr)
1772 buffer[len++] = ' ';
1773 memcpy(buffer+len, s->protinfo.af_unix.addr->name->sun_path,
1774 s->protinfo.af_unix.addr->len-sizeof(short));
1775 if (!UNIX_ABSTRACT(s))
1779 len += s->protinfo.af_unix.addr->len - sizeof(short);
1781 unix_state_runlock(s);
1791 if(pos>offset+length)
1796 read_unlock(&unix_table_lock);
1797 *start=buffer+(offset-begin);
1798 len-=(offset-begin);
/*
 * Operations table selected by unix_create() for stream sockets.
 * Socket options and sendpage use the generic sock_no_* stubs.
 * NOTE(review): some entries are not visible in this copy.
 */
1807 struct proto_ops unix_stream_ops = {
1810 release: unix_release,
1812 connect: unix_stream_connect,
1813 socketpair: unix_socketpair,
1814 accept: unix_accept,
1815 getname: unix_getname,
1818 listen: unix_listen,
1819 shutdown: unix_shutdown,
1820 setsockopt: sock_no_setsockopt,
1821 getsockopt: sock_no_getsockopt,
1822 sendmsg: unix_stream_sendmsg,
1823 recvmsg: unix_stream_recvmsg,
1825 sendpage: sock_no_sendpage,
/*
 * Operations table selected by unix_create() for datagram sockets.
 * accept and listen use the generic sock_no_* stubs and poll uses
 * datagram_poll.
 * NOTE(review): some entries are not visible in this copy.
 */
1828 struct proto_ops unix_dgram_ops = {
1831 release: unix_release,
1833 connect: unix_dgram_connect,
1834 socketpair: unix_socketpair,
1835 accept: sock_no_accept,
1836 getname: unix_getname,
1837 poll: datagram_poll,
1839 listen: sock_no_listen,
1840 shutdown: unix_shutdown,
1841 setsockopt: sock_no_setsockopt,
1842 getsockopt: sock_no_getsockopt,
1843 sendmsg: unix_dgram_sendmsg,
1844 recvmsg: unix_dgram_recvmsg,
1846 sendpage: sock_no_sendpage,
/* Family record passed to sock_register() in af_unix_init(); its members are not visible in this copy. */
1849 struct net_proto_family unix_family_ops = {
/*
 * sysctl hooks: external functions when CONFIG_SYSCTL is set, and empty
 * inline stubs as the alternative so that init and exit can call them
 * unconditionally.
 */
1854 #ifdef CONFIG_SYSCTL
1855 extern void unix_sysctl_register(void);
1856 extern void unix_sysctl_unregister(void);
1858 static inline void unix_sysctl_register(void) {}
1859 static inline void unix_sysctl_unregister(void) {}
/* Start-up message; placed in init data. */
1862 static char banner[] __initdata = KERN_INFO "NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.\n";
/*
 * Module initialisation: checks that struct unix_skb_parms fits in the
 * control block of an skb (logging a critical message otherwise),
 * registers the address family, creates the net/unix proc entry when
 * CONFIG_PROC_FS is set, and registers the sysctl hooks.
 * NOTE(review): the return statements are not visible in this copy.
 */
1864 static int __init af_unix_init(void)
/* Used only as the operand of sizeof below. */
1866 struct sk_buff *dummy_skb;
1869 if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb))
1871 printk(KERN_CRIT "unix_proto_init: panic\n");
1874 sock_register(&unix_family_ops);
1875 #ifdef CONFIG_PROC_FS
1876 create_proc_read_entry("net/unix", 0, 0, unix_read_proc, NULL);
1878 unix_sysctl_register();
/* Module exit: unregisters the family, then the sysctl hooks, then removes the net/unix proc entry. */
1882 static void __exit af_unix_exit(void)
1884 sock_unregister(PF_UNIX);
1885 unix_sysctl_unregister();
1886 remove_proc_entry("net/unix", 0);
/* Module entry and exit points, and the licence declaration. */
1889 module_init(af_unix_init);
1890 module_exit(af_unix_exit);
1892 MODULE_LICENSE("GPL");