Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
author: Linus Torvalds <torvalds@woody.linux-foundation.org>
Mon, 21 May 2007 23:19:32 +0000 (16:19 -0700)
committer: Linus Torvalds <torvalds@woody.linux-foundation.org>
Mon, 21 May 2007 23:19:32 +0000 (16:19 -0700)
* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband:
  IB/cm: Improve local id allocation
  IPoIB/cm: Fix SRQ WR leak
  IB/ipoib: Fix typos in error messages
  IB/mlx4: Check if SRQ is full when posting receive
  IB/mlx4: Pass send queue sizes from userspace to kernel
  IB/mlx4: Fix check of opcode in mlx4_ib_post_send()
  mlx4_core: Fix array overrun in dump_dev_cap_flags()
  IB/mlx4: Fix RESET to RESET and RESET to ERROR transitions
  IB/mthca: Fix RESET to ERROR transition
  IB/mlx4: Set GRH:HopLimit when sending globally routed MADs
  IB/mthca: Set GRH:HopLimit when building MLX headers
  IB/mlx4: Fix check of max_qp_dest_rdma in modify QP
  IB/mthca: Fix use-after-free on device restart
  IB/ehca: Return proper error code if register_mr fails
  IPoIB: Handle P_Key table reordering
  IB/core: Use start_port() and end_port()
  IB/core: Add helpers for uncached GID and P_Key searches
  IB/ipath: Fix potential deadlock with multicast spinlocks
  IB/core: Free umem when mm is already gone

19 files changed:
drivers/infiniband/core/cm.c
drivers/infiniband/core/device.c
drivers/infiniband/core/umem.c
drivers/infiniband/hw/ehca/ehca_mrmw.c
drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx4/srq.c
drivers/infiniband/hw/mlx4/user.h
drivers/infiniband/hw/mthca/mthca_av.c
drivers/infiniband/hw/mthca/mthca_main.c
drivers/infiniband/hw/mthca/mthca_qp.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/net/mlx4/fw.c
include/rdma/ib_verbs.h

index eff591d..e840434 100644 (file)
@@ -306,7 +306,9 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv)
        do {
                spin_lock_irqsave(&cm.lock, flags);
                ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
-                                       next_id++, &id);
+                                       next_id, &id);
+               if (!ret)
+                       next_id = ((unsigned) id + 1) & MAX_ID_MASK;
                spin_unlock_irqrestore(&cm.lock, flags);
        } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
 
index bcecf4d..3ada17c 100644 (file)
@@ -150,6 +150,18 @@ static int alloc_name(char *name)
        return 0;
 }
 
+static int start_port(struct ib_device *device)
+{
+       return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+       return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+               0 : device->phys_port_cnt;
+}
+
 /**
  * ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -209,6 +221,45 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
        return 0;
 }
 
+static int read_port_table_lengths(struct ib_device *device)
+{
+       struct ib_port_attr *tprops = NULL;
+       int num_ports, ret = -ENOMEM;
+       u8 port_index;
+
+       tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+       if (!tprops)
+               goto out;
+
+       num_ports = end_port(device) - start_port(device) + 1;
+
+       device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+                                      GFP_KERNEL);
+       device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+                                     GFP_KERNEL);
+       if (!device->pkey_tbl_len || !device->gid_tbl_len)
+               goto err;
+
+       for (port_index = 0; port_index < num_ports; ++port_index) {
+               ret = ib_query_port(device, port_index + start_port(device),
+                                       tprops);
+               if (ret)
+                       goto err;
+               device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+               device->gid_tbl_len[port_index]  = tprops->gid_tbl_len;
+       }
+
+       ret = 0;
+       goto out;
+
+err:
+       kfree(device->gid_tbl_len);
+       kfree(device->pkey_tbl_len);
+out:
+       kfree(tprops);
+       return ret;
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
@@ -240,10 +291,19 @@ int ib_register_device(struct ib_device *device)
        spin_lock_init(&device->event_handler_lock);
        spin_lock_init(&device->client_data_lock);
 
+       ret = read_port_table_lengths(device);
+       if (ret) {
+               printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+                      device->name);
+               goto out;
+       }
+
        ret = ib_device_register_sysfs(device);
        if (ret) {
                printk(KERN_WARNING "Couldn't register device %s with driver model\n",
                       device->name);
+               kfree(device->gid_tbl_len);
+               kfree(device->pkey_tbl_len);
                goto out;
        }
 
@@ -285,6 +345,9 @@ void ib_unregister_device(struct ib_device *device)
 
        list_del(&device->core_list);
 
+       kfree(device->gid_tbl_len);
+       kfree(device->pkey_tbl_len);
+
        mutex_unlock(&device_mutex);
 
        spin_lock_irqsave(&device->client_data_lock, flags);
@@ -507,10 +570,7 @@ int ib_query_port(struct ib_device *device,
                  u8 port_num,
                  struct ib_port_attr *port_attr)
 {
-       if (device->node_type == RDMA_NODE_IB_SWITCH) {
-               if (port_num)
-                       return -EINVAL;
-       } else if (port_num < 1 || port_num > device->phys_port_cnt)
+       if (port_num < start_port(device) || port_num > end_port(device))
                return -EINVAL;
 
        return device->query_port(device, port_num, port_attr);
@@ -582,10 +642,7 @@ int ib_modify_port(struct ib_device *device,
                   u8 port_num, int port_modify_mask,
                   struct ib_port_modify *port_modify)
 {
-       if (device->node_type == RDMA_NODE_IB_SWITCH) {
-               if (port_num)
-                       return -EINVAL;
-       } else if (port_num < 1 || port_num > device->phys_port_cnt)
+       if (port_num < start_port(device) || port_num > end_port(device))
                return -EINVAL;
 
        return device->modify_port(device, port_num, port_modify_mask,
@@ -593,6 +650,68 @@ int ib_modify_port(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_modify_port);
 
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+               u8 *port_num, u16 *index)
+{
+       union ib_gid tmp_gid;
+       int ret, port, i;
+
+       for (port = start_port(device); port <= end_port(device); ++port) {
+               for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+                       ret = ib_query_gid(device, port, i, &tmp_gid);
+                       if (ret)
+                               return ret;
+                       if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+                               *port_num = port;
+                               if (index)
+                                       *index = i;
+                               return 0;
+                       }
+               }
+       }
+
+       return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+                u8 port_num, u16 pkey, u16 *index)
+{
+       int ret, i;
+       u16 tmp_pkey;
+
+       for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+               ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+               if (ret)
+                       return ret;
+
+               if (pkey == tmp_pkey) {
+                       *index = i;
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
 static int __init ib_core_init(void)
 {
        int ret;
index 96a16c0..b4aec51 100644 (file)
@@ -210,8 +210,10 @@ void ib_umem_release(struct ib_umem *umem)
        __ib_umem_release(umem->context->device, umem, 1);
 
        mm = get_task_mm(current);
-       if (!mm)
+       if (!mm) {
+               kfree(umem);
                return;
+       }
 
        diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
 
index 84c5bb4..add79bd 100644 (file)
@@ -2050,13 +2050,10 @@ int ehca_mrmw_map_hrc_alloc(const u64 hipz_rc)
        switch (hipz_rc) {
        case H_SUCCESS:              /* successful completion */
                return 0;
-       case H_ADAPTER_PARM:         /* invalid adapter handle */
-       case H_RT_PARM:              /* invalid resource type */
        case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
-       case H_MLENGTH_PARM:         /* invalid memory length */
-       case H_MEM_ACCESS_PARM:      /* invalid access controls */
        case H_CONSTRAINED:          /* resource constraint */
-               return -EINVAL;
+       case H_NO_MEM:
+               return -ENOMEM;
        case H_BUSY:                 /* long busy */
                return -EBUSY;
        default:
index 085e28b..dd691cf 100644 (file)
@@ -165,10 +165,9 @@ static int ipath_mcast_add(struct ipath_ibdev *dev,
 {
        struct rb_node **n = &mcast_tree.rb_node;
        struct rb_node *pn = NULL;
-       unsigned long flags;
        int ret;
 
-       spin_lock_irqsave(&mcast_lock, flags);
+       spin_lock_irq(&mcast_lock);
 
        while (*n) {
                struct ipath_mcast *tmcast;
@@ -228,7 +227,7 @@ static int ipath_mcast_add(struct ipath_ibdev *dev,
        ret = 0;
 
 bail:
-       spin_unlock_irqrestore(&mcast_lock, flags);
+       spin_unlock_irq(&mcast_lock);
 
        return ret;
 }
@@ -289,17 +288,16 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
        struct ipath_mcast *mcast = NULL;
        struct ipath_mcast_qp *p, *tmp;
        struct rb_node *n;
-       unsigned long flags;
        int last = 0;
        int ret;
 
-       spin_lock_irqsave(&mcast_lock, flags);
+       spin_lock_irq(&mcast_lock);
 
        /* Find the GID in the mcast table. */
        n = mcast_tree.rb_node;
        while (1) {
                if (n == NULL) {
-                       spin_unlock_irqrestore(&mcast_lock, flags);
+                       spin_unlock_irq(&mcast_lock);
                        ret = -EINVAL;
                        goto bail;
                }
@@ -334,7 +332,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
                break;
        }
 
-       spin_unlock_irqrestore(&mcast_lock, flags);
+       spin_unlock_irq(&mcast_lock);
 
        if (p) {
                /*
@@ -348,9 +346,9 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
                atomic_dec(&mcast->refcount);
                wait_event(mcast->wait, !atomic_read(&mcast->refcount));
                ipath_mcast_free(mcast);
-               spin_lock(&dev->n_mcast_grps_lock);
+               spin_lock_irq(&dev->n_mcast_grps_lock);
                dev->n_mcast_grps_allocated--;
-               spin_unlock(&dev->n_mcast_grps_lock);
+               spin_unlock_irq(&dev->n_mcast_grps_lock);
        }
 
        ret = 0;
index 5cd7069..a824bc5 100644 (file)
@@ -188,14 +188,32 @@ static int send_wqe_overhead(enum ib_qp_type type)
        }
 }
 
-static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-                      enum ib_qp_type type, struct mlx4_ib_qp *qp)
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+                      struct mlx4_ib_qp *qp)
 {
-       /* Sanity check QP size before proceeding */
+       /* Sanity check RQ size before proceeding */
+       if (cap->max_recv_wr  > dev->dev->caps.max_wqes  ||
+           cap->max_recv_sge > dev->dev->caps.max_rq_sg)
+               return -EINVAL;
+
+       qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
+
+       qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
+                                                   sizeof (struct mlx4_wqe_data_seg)));
+       qp->rq.max_gs    = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+
+       cap->max_recv_wr  = qp->rq.max;
+       cap->max_recv_sge = qp->rq.max_gs;
+
+       return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+                             enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+       /* Sanity check SQ size before proceeding */
        if (cap->max_send_wr     > dev->dev->caps.max_wqes  ||
-           cap->max_recv_wr     > dev->dev->caps.max_wqes  ||
            cap->max_send_sge    > dev->dev->caps.max_sq_sg ||
-           cap->max_recv_sge    > dev->dev->caps.max_rq_sg ||
            cap->max_inline_data + send_wqe_overhead(type) +
            sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
                return -EINVAL;
@@ -208,12 +226,7 @@ static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
            cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
                return -EINVAL;
 
-       qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
-       qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 0;
-
-       qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
-                                                   sizeof (struct mlx4_wqe_data_seg)));
-       qp->rq.max_gs    = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+       qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 1;
 
        qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
                                                        sizeof (struct mlx4_wqe_data_seg),
@@ -233,16 +246,26 @@ static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                qp->sq.offset = 0;
        }
 
-       cap->max_send_wr  = qp->sq.max;
-       cap->max_recv_wr  = qp->rq.max;
-       cap->max_send_sge = qp->sq.max_gs;
-       cap->max_recv_sge = qp->rq.max_gs;
+       cap->max_send_wr     = qp->sq.max;
+       cap->max_send_sge    = qp->sq.max_gs;
        cap->max_inline_data = (1 << qp->sq.wqe_shift) - send_wqe_overhead(type) -
                sizeof (struct mlx4_wqe_inline_seg);
 
        return 0;
 }
 
+static int set_user_sq_size(struct mlx4_ib_qp *qp,
+                           struct mlx4_ib_create_qp *ucmd)
+{
+       qp->sq.max       = 1 << ucmd->log_sq_bb_count;
+       qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+       qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
+               (qp->sq.max << qp->sq.wqe_shift);
+
+       return 0;
+}
+
 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
@@ -264,7 +287,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
        qp->sq.head         = 0;
        qp->sq.tail         = 0;
 
-       err = set_qp_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+       err = set_rq_size(dev, &init_attr->cap, qp);
        if (err)
                goto err;
 
@@ -276,6 +299,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                        goto err;
                }
 
+               err = set_user_sq_size(qp, &ucmd);
+               if (err)
+                       goto err;
+
                qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
                                       qp->buf_size, 0);
                if (IS_ERR(qp->umem)) {
@@ -297,6 +324,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                if (err)
                        goto err_mtt;
        } else {
+               err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+               if (err)
+                       goto err;
+
                err = mlx4_ib_db_alloc(dev, &qp->db, 0);
                if (err)
                        goto err;
@@ -573,7 +604,7 @@ static int to_mlx4_st(enum ib_qp_type type)
        }
 }
 
-static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *attr,
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
                                   int attr_mask)
 {
        u8 dest_rd_atomic;
@@ -603,7 +634,7 @@ static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *att
        return cpu_to_be32(hw_access_flags);
 }
 
-static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, struct ib_qp_attr *attr,
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
                            int attr_mask)
 {
        if (attr_mask & IB_QP_PKEY_INDEX)
@@ -619,7 +650,7 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
        path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
 }
 
-static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
                         struct mlx4_qp_path *path, u8 port)
 {
        path->grh_mylmc     = ah->src_path_bits & 0x7f;
@@ -655,14 +686,14 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
        return 0;
 }
 
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                     int attr_mask, struct ib_udata *udata)
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+                              const struct ib_qp_attr *attr, int attr_mask,
+                              enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
        struct mlx4_qp_context *context;
        enum mlx4_qp_optpar optpar = 0;
-       enum ib_qp_state cur_state, new_state;
        int sqd_event;
        int err = -EINVAL;
 
@@ -670,34 +701,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        if (!context)
                return -ENOMEM;
 
-       mutex_lock(&qp->mutex);
-
-       cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
-       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
-               goto out;
-
-       if ((attr_mask & IB_QP_PKEY_INDEX) &&
-            attr->pkey_index >= dev->dev->caps.pkey_table_len) {
-               goto out;
-       }
-
-       if ((attr_mask & IB_QP_PORT) &&
-           (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
-               goto out;
-       }
-
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-           attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
-               goto out;
-       }
-
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-           attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
-               goto out;
-       }
-
        context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
                                     (to_mlx4_st(ibqp->qp_type) << 16));
        context->flags     |= cpu_to_be32(1 << 8); /* DE? */
@@ -920,11 +923,84 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        }
 
 out:
-       mutex_unlock(&qp->mutex);
        kfree(context);
        return err;
 }
 
+static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
+static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
+               [IB_QPT_UD]  = (IB_QP_PKEY_INDEX                |
+                               IB_QP_PORT                      |
+                               IB_QP_QKEY),
+               [IB_QPT_UC]  = (IB_QP_PKEY_INDEX                |
+                               IB_QP_PORT                      |
+                               IB_QP_ACCESS_FLAGS),
+               [IB_QPT_RC]  = (IB_QP_PKEY_INDEX                |
+                               IB_QP_PORT                      |
+                               IB_QP_ACCESS_FLAGS),
+               [IB_QPT_SMI] = (IB_QP_PKEY_INDEX                |
+                               IB_QP_QKEY),
+               [IB_QPT_GSI] = (IB_QP_PKEY_INDEX                |
+                               IB_QP_QKEY),
+};
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                     int attr_mask, struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       enum ib_qp_state cur_state, new_state;
+       int err = -EINVAL;
+
+       mutex_lock(&qp->mutex);
+
+       cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+               goto out;
+
+       if ((attr_mask & IB_QP_PKEY_INDEX) &&
+            attr->pkey_index >= dev->dev->caps.pkey_table_len) {
+               goto out;
+       }
+
+       if ((attr_mask & IB_QP_PORT) &&
+           (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+               goto out;
+       }
+
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+           attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+               goto out;
+       }
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+           attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+               goto out;
+       }
+
+       if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+               err = 0;
+               goto out;
+       }
+
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+               err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
+                                         mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
+                                         IB_QPS_RESET, IB_QPS_INIT);
+               if (err)
+                       goto out;
+               cur_state = IB_QPS_INIT;
+       }
+
+       err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+       mutex_unlock(&qp->mutex);
+       return err;
+}
+
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
                            void *wqe)
 {
@@ -952,6 +1028,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
                        (be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
                sqp->ud_header.grh.flow_label    =
                        ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+               sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
                ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
                                  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
                memcpy(sqp->ud_header.grh.destination_gid.raw,
@@ -1192,7 +1269,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                 */
                wmb();
 
-               if (wr->opcode < 0 || wr->opcode > ARRAY_SIZE(mlx4_ib_opcode)) {
+               if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
                        err = -EINVAL;
                        goto out;
                }
index 42ab4a8..12fac1c 100644 (file)
@@ -297,6 +297,12 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
                        break;
                }
 
+               if (unlikely(srq->head == srq->tail)) {
+                       err = -ENOMEM;
+                       *bad_wr = wr;
+                       break;
+               }
+
                srq->wrid[srq->head] = wr->wr_id;
 
                next      = get_wqe(srq, srq->head);
index 5b8eddc..88c72d5 100644 (file)
@@ -39,7 +39,7 @@
  * Increment this value if any changes that break userspace ABI
  * compatibility are made.
  */
-#define MLX4_IB_UVERBS_ABI_VERSION     1
+#define MLX4_IB_UVERBS_ABI_VERSION     2
 
 /*
  * Make sure that all structs defined in this file remain laid out so
@@ -87,6 +87,9 @@ struct mlx4_ib_create_srq_resp {
 struct mlx4_ib_create_qp {
        __u64   buf_addr;
        __u64   db_addr;
+        __u8   log_sq_bb_count;
+        __u8   log_sq_stride;
+        __u8   reserved[6];
 };
 
 #endif /* MLX4_IB_USER_H */
index 27caf3b..4b111a8 100644 (file)
@@ -279,6 +279,7 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
                        (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
                header->grh.flow_label    =
                        ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+               header->grh.hop_limit     = ah->av->hop_limit;
                ib_get_cached_gid(&dev->ib_dev,
                                  be32_to_cpu(ah->av->port_pd) >> 24,
                                  ah->av->gid_index % dev->limits.gid_table_len,
index 773145e..aa563e6 100644 (file)
@@ -1250,12 +1250,14 @@ static void __mthca_remove_one(struct pci_dev *pdev)
 int __mthca_restart_one(struct pci_dev *pdev)
 {
        struct mthca_dev *mdev;
+       int hca_type;
 
        mdev = pci_get_drvdata(pdev);
        if (!mdev)
                return -ENODEV;
+       hca_type = mdev->hca_type;
        __mthca_remove_one(pdev);
-       return __mthca_init_one(pdev, mdev->hca_type);
+       return __mthca_init_one(pdev, hca_type);
 }
 
 static int __devinit mthca_init_one(struct pci_dev *pdev,
index 2741ded..0276649 100644 (file)
@@ -296,7 +296,7 @@ static int to_mthca_st(int transport)
        }
 }
 
-static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
                        int attr_mask)
 {
        if (attr_mask & IB_QP_PKEY_INDEX)
@@ -328,7 +328,7 @@ static void init_port(struct mthca_dev *dev, int port)
                mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
 }
 
-static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr,
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
                                  int attr_mask)
 {
        u8 dest_rd_atomic;
@@ -511,7 +511,7 @@ out:
        return err;
 }
 
-static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah,
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
                          struct mthca_qp_path *path, u8 port)
 {
        path->g_mylmc     = ah->src_path_bits & 0x7f;
@@ -539,12 +539,12 @@ static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah,
        return 0;
 }
 
-int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                   struct ib_udata *udata)
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+                            const struct ib_qp_attr *attr, int attr_mask,
+                            enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
        struct mthca_dev *dev = to_mdev(ibqp->device);
        struct mthca_qp *qp = to_mqp(ibqp);
-       enum ib_qp_state cur_state, new_state;
        struct mthca_mailbox *mailbox;
        struct mthca_qp_param *qp_param;
        struct mthca_qp_context *qp_context;
@@ -552,60 +552,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
        u8 status;
        int err = -EINVAL;
 
-       mutex_lock(&qp->mutex);
-
-       if (attr_mask & IB_QP_CUR_STATE) {
-               cur_state = attr->cur_qp_state;
-       } else {
-               spin_lock_irq(&qp->sq.lock);
-               spin_lock(&qp->rq.lock);
-               cur_state = qp->state;
-               spin_unlock(&qp->rq.lock);
-               spin_unlock_irq(&qp->sq.lock);
-       }
-
-       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
-               mthca_dbg(dev, "Bad QP transition (transport %d) "
-                         "%d->%d with attr 0x%08x\n",
-                         qp->transport, cur_state, new_state,
-                         attr_mask);
-               goto out;
-       }
-
-       if (cur_state == new_state && cur_state == IB_QPS_RESET) {
-               err = 0;
-               goto out;
-       }
-
-       if ((attr_mask & IB_QP_PKEY_INDEX) &&
-            attr->pkey_index >= dev->limits.pkey_table_len) {
-               mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
-                         attr->pkey_index, dev->limits.pkey_table_len-1);
-               goto out;
-       }
-
-       if ((attr_mask & IB_QP_PORT) &&
-           (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
-               mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
-               goto out;
-       }
-
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-           attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
-               mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
-                         attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
-               goto out;
-       }
-
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-           attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
-               mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
-                         attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
-               goto out;
-       }
-
        mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
        if (IS_ERR(mailbox)) {
                err = PTR_ERR(mailbox);
@@ -892,6 +838,98 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 
 out_mailbox:
        mthca_free_mailbox(dev, mailbox);
+out:
+       return err;
+}
+
+static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 };
+static const int dummy_init_attr_mask[] = {
+       [IB_QPT_UD]  = (IB_QP_PKEY_INDEX                |
+                       IB_QP_PORT                      |
+                       IB_QP_QKEY),
+       [IB_QPT_UC]  = (IB_QP_PKEY_INDEX                |
+                       IB_QP_PORT                      |
+                       IB_QP_ACCESS_FLAGS),
+       [IB_QPT_RC]  = (IB_QP_PKEY_INDEX                |
+                       IB_QP_PORT                      |
+                       IB_QP_ACCESS_FLAGS),
+       [IB_QPT_SMI] = (IB_QP_PKEY_INDEX                |
+                       IB_QP_QKEY),
+       [IB_QPT_GSI] = (IB_QP_PKEY_INDEX                |
+                       IB_QP_QKEY),
+};
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+                   struct ib_udata *udata)
+{
+       struct mthca_dev *dev = to_mdev(ibqp->device);
+       struct mthca_qp *qp = to_mqp(ibqp);
+       enum ib_qp_state cur_state, new_state;
+       int err = -EINVAL;
+
+       mutex_lock(&qp->mutex);
+       if (attr_mask & IB_QP_CUR_STATE) {
+               cur_state = attr->cur_qp_state;
+       } else {
+               spin_lock_irq(&qp->sq.lock);
+               spin_lock(&qp->rq.lock);
+               cur_state = qp->state;
+               spin_unlock(&qp->rq.lock);
+               spin_unlock_irq(&qp->sq.lock);
+       }
+
+       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+               mthca_dbg(dev, "Bad QP transition (transport %d) "
+                         "%d->%d with attr 0x%08x\n",
+                         qp->transport, cur_state, new_state,
+                         attr_mask);
+               goto out;
+       }
+
+       if ((attr_mask & IB_QP_PKEY_INDEX) &&
+            attr->pkey_index >= dev->limits.pkey_table_len) {
+               mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+                         attr->pkey_index, dev->limits.pkey_table_len-1);
+               goto out;
+       }
+
+       if ((attr_mask & IB_QP_PORT) &&
+           (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+               mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+               goto out;
+       }
+
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+           attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+               mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+                         attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+               goto out;
+       }
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+           attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+               mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+                         attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+               goto out;
+       }
+
+       if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+               err = 0;
+               goto out;
+       }
+
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+               err = __mthca_modify_qp(ibqp, &dummy_init_attr,
+                                       dummy_init_attr_mask[ibqp->qp_type],
+                                       IB_QPS_RESET, IB_QPS_INIT);
+               if (err)
+                       goto out;
+               cur_state = IB_QPS_INIT;
+       }
+
+       err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
 
 out:
        mutex_unlock(&qp->mutex);
index 87310ee..a0b3782 100644 (file)
@@ -132,12 +132,46 @@ struct ipoib_cm_data {
        __be32 mtu;
 };
 
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State.  The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ *       to be empty or the number of Poll CQ operations has exceeded
+ *       CQ capacity size;
+ * - or
+ *       post another WR that completes on the same CQ and wait for this
+ *       WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * rx_drain_qp before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+       IPOIB_CM_RX_LIVE,
+       IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+       IPOIB_CM_RX_FLUSH  /* Last WQE Reached event observed */
+};
+
 struct ipoib_cm_rx {
        struct ib_cm_id     *id;
        struct ib_qp        *qp;
        struct list_head     list;
        struct net_device   *dev;
        unsigned long        jiffies;
+       enum ipoib_cm_state  state;
 };
 
 struct ipoib_cm_tx {
@@ -165,10 +199,16 @@ struct ipoib_cm_dev_priv {
        struct ib_srq          *srq;
        struct ipoib_cm_rx_buf *srq_ring;
        struct ib_cm_id        *id;
-       struct list_head        passive_ids;
+       struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
+       struct list_head        passive_ids;   /* state: LIVE */
+       struct list_head        rx_error_list; /* state: ERROR */
+       struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
+       struct list_head        rx_drain_list; /* state: FLUSH, drain started */
+       struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
        struct work_struct      start_task;
        struct work_struct      reap_task;
        struct work_struct      skb_task;
+       struct work_struct      rx_reap_task;
        struct delayed_work     stale_task;
        struct sk_buff_head     skb_queue;
        struct list_head        start_list;
@@ -201,15 +241,17 @@ struct ipoib_dev_priv {
        struct list_head multicast_list;
        struct rb_root multicast_tree;
 
-       struct delayed_work pkey_task;
+       struct delayed_work pkey_poll_task;
        struct delayed_work mcast_task;
        struct work_struct flush_task;
        struct work_struct restart_task;
        struct delayed_work ah_reap_task;
+       struct work_struct pkey_event_task;
 
        struct ib_device *ca;
        u8                port;
        u16               pkey;
+       u16               pkey_index;
        struct ib_pd     *pd;
        struct ib_mr     *mr;
        struct ib_cq     *cq;
@@ -333,12 +375,13 @@ struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
 int ipoib_ib_dev_open(struct net_device *dev);
 int ipoib_ib_dev_up(struct net_device *dev);
 int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
 
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
index eec833b..ffec794 100644 (file)
@@ -37,6 +37,7 @@
 #include <net/dst.h>
 #include <net/icmp.h>
 #include <linux/icmpv6.h>
+#include <linux/delay.h>
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 static int data_debug_level;
@@ -62,6 +63,16 @@ struct ipoib_cm_id {
        u32 remote_mtu;
 };
 
+static struct ib_qp_attr ipoib_cm_err_attr = {
+       .qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
+
+static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
+       .wr_id = IPOIB_CM_RX_DRAIN_WRID
+};
+
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
                               struct ib_cm_event *event);
 
@@ -150,11 +161,44 @@ partial_error:
        return NULL;
 }
 
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+{
+       struct ib_recv_wr *bad_wr;
+
+       /* The drain WR is posted to rx_drain_qp's receive queue, whose depth
+        * is 1, so make sure we have at most 1 outstanding WR. */
+       if (list_empty(&priv->cm.rx_flush_list) ||
+           !list_empty(&priv->cm.rx_drain_list))
+               return;
+
+       if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+               ipoib_warn(priv, "failed to post rx_drain wr\n");
+
+       list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+       struct ipoib_cm_rx *p = ctx;
+       struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+       unsigned long flags;
+
+       if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+               return;
+
+       spin_lock_irqsave(&priv->lock, flags);
+       list_move(&p->list, &priv->cm.rx_flush_list);
+       p->state = IPOIB_CM_RX_FLUSH;
+       ipoib_cm_start_rx_drain(priv);
+       spin_unlock_irqrestore(&priv->lock, flags);
+}
+
 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
                                           struct ipoib_cm_rx *p)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_init_attr attr = {
+               .event_handler = ipoib_cm_rx_event_handler,
                .send_cq = priv->cq, /* does not matter, we never send anything */
                .recv_cq = priv->cq,
                .srq = priv->cm.srq,
@@ -256,6 +300,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 
        cm_id->context = p;
        p->jiffies = jiffies;
+       p->state = IPOIB_CM_RX_LIVE;
        spin_lock_irq(&priv->lock);
        if (list_empty(&priv->cm.passive_ids))
                queue_delayed_work(ipoib_workqueue,
@@ -277,7 +322,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
 {
        struct ipoib_cm_rx *p;
        struct ipoib_dev_priv *priv;
-       int ret;
 
        switch (event->event) {
        case IB_CM_REQ_RECEIVED:
@@ -289,20 +333,9 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
        case IB_CM_REJ_RECEIVED:
                p = cm_id->context;
                priv = netdev_priv(p->dev);
-               spin_lock_irq(&priv->lock);
-               if (list_empty(&p->list))
-                       ret = 0; /* Connection is going away already. */
-               else {
-                       list_del_init(&p->list);
-                       ret = -ECONNRESET;
-               }
-               spin_unlock_irq(&priv->lock);
-               if (ret) {
-                       ib_destroy_qp(p->qp);
-                       kfree(p);
-                       return ret;
-               }
-               return 0;
+               if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+                       ipoib_warn(priv, "unable to move qp to error state\n");
+               /* Fall through */
        default:
                return 0;
        }
@@ -354,8 +387,15 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                       wr_id, wc->status);
 
        if (unlikely(wr_id >= ipoib_recvq_size)) {
-               ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
-                          wr_id, ipoib_recvq_size);
+               if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+                       spin_lock_irqsave(&priv->lock, flags);
+                       list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+                       ipoib_cm_start_rx_drain(priv);
+                       queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+                       spin_unlock_irqrestore(&priv->lock, flags);
+               } else
+                       ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+                                  wr_id, ipoib_recvq_size);
                return;
        }
 
@@ -374,9 +414,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
                        spin_lock_irqsave(&priv->lock, flags);
                        p->jiffies = jiffies;
-                       /* Move this entry to list head, but do
-                        * not re-add it if it has been removed. */
-                       if (!list_empty(&p->list))
+                       /* Move this entry to list head, but do not re-add it
+                        * if it has been moved out of list. */
+                       if (p->state == IPOIB_CM_RX_LIVE)
                                list_move(&p->list, &priv->cm.passive_ids);
                        spin_unlock_irqrestore(&priv->lock, flags);
                }
@@ -583,17 +623,43 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
 int ipoib_cm_dev_open(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_qp_init_attr qp_init_attr = {
+               .send_cq = priv->cq,   /* does not matter, we never send anything */
+               .recv_cq = priv->cq,
+               .cap.max_send_wr = 1,  /* FIXME: 0 Seems not to work */
+               .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+               .cap.max_recv_wr = 1,
+               .cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
+               .sq_sig_type = IB_SIGNAL_ALL_WR,
+               .qp_type = IB_QPT_UC,
+       };
        int ret;
 
        if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
                return 0;
 
+       priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
+       if (IS_ERR(priv->cm.rx_drain_qp)) {
+               printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+               ret = PTR_ERR(priv->cm.rx_drain_qp);
+               return ret;
+       }
+
+       /*
+        * We put the QP in error state directly.  This way, a "flush
+        * error" WC will be immediately generated for each WR we post.
+        */
+       ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
+               goto err_qp;
+       }
+
        priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
        if (IS_ERR(priv->cm.id)) {
                printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
                ret = PTR_ERR(priv->cm.id);
-               priv->cm.id = NULL;
-               return ret;
+               goto err_cm;
        }
 
        ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -601,35 +667,79 @@ int ipoib_cm_dev_open(struct net_device *dev)
        if (ret) {
                printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
                       IPOIB_CM_IETF_ID | priv->qp->qp_num);
-               ib_destroy_cm_id(priv->cm.id);
-               priv->cm.id = NULL;
-               return ret;
+               goto err_listen;
        }
+
        return 0;
+
+err_listen:
+       ib_destroy_cm_id(priv->cm.id);
+err_cm:
+       priv->cm.id = NULL;
+err_qp:
+       ib_destroy_qp(priv->cm.rx_drain_qp);
+       return ret;
 }
 
 void ipoib_cm_dev_stop(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       struct ipoib_cm_rx *p;
+       struct ipoib_cm_rx *p, *n;
+       unsigned long begin;
+       LIST_HEAD(list);
+       int ret;
 
        if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
                return;
 
        ib_destroy_cm_id(priv->cm.id);
        priv->cm.id = NULL;
+
        spin_lock_irq(&priv->lock);
        while (!list_empty(&priv->cm.passive_ids)) {
                p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
-               list_del_init(&p->list);
+               list_move(&p->list, &priv->cm.rx_error_list);
+               p->state = IPOIB_CM_RX_ERROR;
                spin_unlock_irq(&priv->lock);
+               ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+               if (ret)
+                       ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+               spin_lock_irq(&priv->lock);
+       }
+
+       /* Wait for all RX to be drained */
+       begin = jiffies;
+
+       while (!list_empty(&priv->cm.rx_error_list) ||
+              !list_empty(&priv->cm.rx_flush_list) ||
+              !list_empty(&priv->cm.rx_drain_list)) {
+               if (!time_after(jiffies, begin + 5 * HZ)) {
+                       ipoib_warn(priv, "RX drain timing out\n");
+
+                       /*
+                        * assume the HW is wedged and just free up everything.
+                        */
+                       list_splice_init(&priv->cm.rx_flush_list, &list);
+                       list_splice_init(&priv->cm.rx_error_list, &list);
+                       list_splice_init(&priv->cm.rx_drain_list, &list);
+                       break;
+               }
+               spin_unlock_irq(&priv->lock);
+               msleep(1);
+               spin_lock_irq(&priv->lock);
+       }
+
+       list_splice_init(&priv->cm.rx_reap_list, &list);
+
+       spin_unlock_irq(&priv->lock);
+
+       list_for_each_entry_safe(p, n, &list, list) {
                ib_destroy_cm_id(p->id);
                ib_destroy_qp(p->qp);
                kfree(p);
-               spin_lock_irq(&priv->lock);
        }
-       spin_unlock_irq(&priv->lock);
 
+       ib_destroy_qp(priv->cm.rx_drain_qp);
        cancel_delayed_work(&priv->cm.stale_task);
 }
 
@@ -1079,24 +1189,44 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
                queue_work(ipoib_workqueue, &priv->cm.skb_task);
 }
 
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+                                                  cm.rx_reap_task);
+       struct ipoib_cm_rx *p, *n;
+       LIST_HEAD(list);
+
+       spin_lock_irq(&priv->lock);
+       list_splice_init(&priv->cm.rx_reap_list, &list);
+       spin_unlock_irq(&priv->lock);
+
+       list_for_each_entry_safe(p, n, &list, list) {
+               ib_destroy_cm_id(p->id);
+               ib_destroy_qp(p->qp);
+               kfree(p);
+       }
+}
+
 static void ipoib_cm_stale_task(struct work_struct *work)
 {
        struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
                                                   cm.stale_task.work);
        struct ipoib_cm_rx *p;
+       int ret;
 
        spin_lock_irq(&priv->lock);
        while (!list_empty(&priv->cm.passive_ids)) {
-               /* List if sorted by LRU, start from tail,
+               /* List is sorted by LRU, start from tail,
                 * stop when we see a recently used entry */
                p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
                if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
                        break;
-               list_del_init(&p->list);
+               list_move(&p->list, &priv->cm.rx_error_list);
+               p->state = IPOIB_CM_RX_ERROR;
                spin_unlock_irq(&priv->lock);
-               ib_destroy_cm_id(p->id);
-               ib_destroy_qp(p->qp);
-               kfree(p);
+               ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+               if (ret)
+                       ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
                spin_lock_irq(&priv->lock);
        }
 
@@ -1164,9 +1294,14 @@ int ipoib_cm_dev_init(struct net_device *dev)
        INIT_LIST_HEAD(&priv->cm.passive_ids);
        INIT_LIST_HEAD(&priv->cm.reap_list);
        INIT_LIST_HEAD(&priv->cm.start_list);
+       INIT_LIST_HEAD(&priv->cm.rx_error_list);
+       INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+       INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+       INIT_LIST_HEAD(&priv->cm.rx_reap_list);
        INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
        INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
        INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+       INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
        INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
 
        skb_queue_head_init(&priv->cm.skb_queue);
index 68d72c6..c1aad06 100644 (file)
@@ -448,6 +448,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int ret;
 
+       if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
+               ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
+               clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+               return -1;
+       }
+       set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
        ret = ipoib_init_qp(dev);
        if (ret) {
                ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
@@ -457,14 +464,14 @@ int ipoib_ib_dev_open(struct net_device *dev)
        ret = ipoib_ib_post_receives(dev);
        if (ret) {
                ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-               ipoib_ib_dev_stop(dev);
+               ipoib_ib_dev_stop(dev, 1);
                return -1;
        }
 
        ret = ipoib_cm_dev_open(dev);
        if (ret) {
-               ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-               ipoib_ib_dev_stop(dev);
+               ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+               ipoib_ib_dev_stop(dev, 1);
                return -1;
        }
 
@@ -516,7 +523,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
        if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
                mutex_lock(&pkey_mutex);
                set_bit(IPOIB_PKEY_STOP, &priv->flags);
-               cancel_delayed_work(&priv->pkey_task);
+               cancel_delayed_work(&priv->pkey_poll_task);
                mutex_unlock(&pkey_mutex);
                if (flush)
                        flush_workqueue(ipoib_workqueue);
@@ -543,7 +550,7 @@ static int recvs_pending(struct net_device *dev)
        return pending;
 }
 
-int ipoib_ib_dev_stop(struct net_device *dev)
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_attr qp_attr;
@@ -629,7 +636,8 @@ timeout:
        /* Wait for all AHs to be reaped */
        set_bit(IPOIB_STOP_REAPER, &priv->flags);
        cancel_delayed_work(&priv->ah_reap_task);
-       flush_workqueue(ipoib_workqueue);
+       if (flush)
+               flush_workqueue(ipoib_workqueue);
 
        begin = jiffies;
 
@@ -673,13 +681,24 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
        return 0;
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 {
-       struct ipoib_dev_priv *cpriv, *priv =
-               container_of(work, struct ipoib_dev_priv, flush_task);
+       struct ipoib_dev_priv *cpriv;
        struct net_device *dev = priv->dev;
+       u16 new_index;
+
+       mutex_lock(&priv->vlan_mutex);
 
-       if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) {
+       /*
+        * Flush any child interfaces too -- they might be up even if
+        * the parent is down.
+        */
+       list_for_each_entry(cpriv, &priv->child_intfs, list)
+               __ipoib_ib_dev_flush(cpriv, pkey_event);
+
+       mutex_unlock(&priv->vlan_mutex);
+
+       if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
                return;
        }
@@ -689,10 +708,32 @@ void ipoib_ib_dev_flush(struct work_struct *work)
                return;
        }
 
+       if (pkey_event) {
+               if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+                       clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+                       ipoib_ib_dev_down(dev, 0);
+                       ipoib_pkey_dev_delay_open(dev);
+                       return;
+               }
+               set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+               /* restart QP only if P_Key index is changed */
+               if (new_index == priv->pkey_index) {
+                       ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+                       return;
+               }
+               priv->pkey_index = new_index;
+       }
+
        ipoib_dbg(priv, "flushing\n");
 
        ipoib_ib_dev_down(dev, 0);
 
+       if (pkey_event) {
+               ipoib_ib_dev_stop(dev, 0);
+               ipoib_ib_dev_open(dev);
+       }
+
        /*
         * The device could have been brought down between the start and when
         * we get here, don't bring it back up if it's not configured up
@@ -701,14 +742,24 @@ void ipoib_ib_dev_flush(struct work_struct *work)
                ipoib_ib_dev_up(dev);
                ipoib_mcast_restart_task(&priv->restart_task);
        }
+}
 
-       mutex_lock(&priv->vlan_mutex);
+void ipoib_ib_dev_flush(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv =
+               container_of(work, struct ipoib_dev_priv, flush_task);
 
-       /* Flush any child interfaces too */
-       list_for_each_entry(cpriv, &priv->child_intfs, list)
-               ipoib_ib_dev_flush(&cpriv->flush_task);
+       ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
+       __ipoib_ib_dev_flush(priv, 0);
+}
 
-       mutex_unlock(&priv->vlan_mutex);
+void ipoib_pkey_event(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv =
+               container_of(work, struct ipoib_dev_priv, pkey_event_task);
+
+       ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
+       __ipoib_ib_dev_flush(priv, 1);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
@@ -736,7 +787,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
 void ipoib_pkey_poll(struct work_struct *work)
 {
        struct ipoib_dev_priv *priv =
-               container_of(work, struct ipoib_dev_priv, pkey_task.work);
+               container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
        struct net_device *dev = priv->dev;
 
        ipoib_pkey_dev_check_presence(dev);
@@ -747,7 +798,7 @@ void ipoib_pkey_poll(struct work_struct *work)
                mutex_lock(&pkey_mutex);
                if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
                        queue_delayed_work(ipoib_workqueue,
-                                          &priv->pkey_task,
+                                          &priv->pkey_poll_task,
                                           HZ);
                mutex_unlock(&pkey_mutex);
        }
@@ -766,7 +817,7 @@ int ipoib_pkey_dev_delay_open(struct net_device *dev)
                mutex_lock(&pkey_mutex);
                clear_bit(IPOIB_PKEY_STOP, &priv->flags);
                queue_delayed_work(ipoib_workqueue,
-                                  &priv->pkey_task,
+                                  &priv->pkey_poll_task,
                                   HZ);
                mutex_unlock(&pkey_mutex);
                return 1;
index 0a428f2..894b1dc 100644 (file)
@@ -107,7 +107,7 @@ int ipoib_open(struct net_device *dev)
                return -EINVAL;
 
        if (ipoib_ib_dev_up(dev)) {
-               ipoib_ib_dev_stop(dev);
+               ipoib_ib_dev_stop(dev, 1);
                return -EINVAL;
        }
 
@@ -152,7 +152,7 @@ static int ipoib_stop(struct net_device *dev)
        flush_workqueue(ipoib_workqueue);
 
        ipoib_ib_dev_down(dev, 1);
-       ipoib_ib_dev_stop(dev);
+       ipoib_ib_dev_stop(dev, 1);
 
        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
                struct ipoib_dev_priv *cpriv;
@@ -988,7 +988,8 @@ static void ipoib_setup(struct net_device *dev)
        INIT_LIST_HEAD(&priv->dead_ahs);
        INIT_LIST_HEAD(&priv->multicast_list);
 
-       INIT_DELAYED_WORK(&priv->pkey_task,    ipoib_pkey_poll);
+       INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+       INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
        INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
        INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
index 54fbead..aae3670 100644 (file)
@@ -524,7 +524,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
                return;
 
        if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
-               ipoib_warn(priv, "ib_gid_entry_get() failed\n");
+               ipoib_warn(priv, "ib_query_gid() failed\n");
        else
                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
index 5c3c6a4..982eb88 100644 (file)
@@ -33,8 +33,6 @@
  * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
-#include <rdma/ib_cache.h>
-
 #include "ipoib.h"
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
@@ -49,7 +47,7 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
        if (!qp_attr)
                goto out;
 
-       if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+       if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
                clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
                ret = -ENXIO;
                goto out;
@@ -94,26 +92,16 @@ int ipoib_init_qp(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int ret;
-       u16 pkey_index;
        struct ib_qp_attr qp_attr;
        int attr_mask;
 
-       /*
-        * Search through the port P_Key table for the requested pkey value.
-        * The port has to be assigned to the respective IB partition in
-        * advance.
-        */
-       ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
-       if (ret) {
-               clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
-               return ret;
-       }
-       set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+       if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+               return -1;
 
        qp_attr.qp_state = IB_QPS_INIT;
        qp_attr.qkey = 0;
        qp_attr.port_num = priv->port;
-       qp_attr.pkey_index = pkey_index;
+       qp_attr.pkey_index = priv->pkey_index;
        attr_mask =
            IB_QP_QKEY |
            IB_QP_PORT |
@@ -185,7 +173,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        size = ipoib_sendq_size + ipoib_recvq_size + 1;
        ret = ipoib_cm_dev_init(dev);
        if (!ret)
-               size += ipoib_recvq_size;
+               size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
 
        priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
        if (IS_ERR(priv->cq)) {
@@ -259,14 +247,18 @@ void ipoib_event(struct ib_event_handler *handler,
        struct ipoib_dev_priv *priv =
                container_of(handler, struct ipoib_dev_priv, event_handler);
 
-       if ((record->event == IB_EVENT_PORT_ERR    ||
-            record->event == IB_EVENT_PKEY_CHANGE ||
-            record->event == IB_EVENT_PORT_ACTIVE ||
-            record->event == IB_EVENT_LID_CHANGE  ||
-            record->event == IB_EVENT_SM_CHANGE   ||
-            record->event == IB_EVENT_CLIENT_REREGISTER) &&
-           record->element.port_num == priv->port) {
+       if (record->element.port_num != priv->port)
+               return;
+
+       if (record->event == IB_EVENT_PORT_ERR    ||
+           record->event == IB_EVENT_PORT_ACTIVE ||
+           record->event == IB_EVENT_LID_CHANGE  ||
+           record->event == IB_EVENT_SM_CHANGE   ||
+           record->event == IB_EVENT_CLIENT_REREGISTER) {
                ipoib_dbg(priv, "Port state change event\n");
                queue_work(ipoib_workqueue, &priv->flush_task);
+       } else if (record->event == IB_EVENT_PKEY_CHANGE) {
+               ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
+               queue_work(ipoib_workqueue, &priv->pkey_event_task);
        }
 }
index c427173..cfa5cc0 100644 (file)
@@ -90,7 +90,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
        int i;
 
        mlx4_dbg(dev, "DEV_CAP flags:\n");
-       for (i = 0; i < 32; ++i)
+       for (i = 0; i < ARRAY_SIZE(fname); ++i)
                if (fname[i] && (flags & (1 << i)))
                        mlx4_dbg(dev, "    %s\n", fname[i]);
 }
index 47cefca..0627a6a 100644 (file)
@@ -890,6 +890,8 @@ struct ib_device {
        spinlock_t                    client_data_lock;
 
        struct ib_cache               cache;
+       int                          *pkey_tbl_len;
+       int                          *gid_tbl_len;
 
        u32                           flags;
 
@@ -1118,6 +1120,12 @@ int ib_modify_port(struct ib_device *device,
                   u8 port_num, int port_modify_mask,
                   struct ib_port_modify *port_modify);
 
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+               u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+                u8 port_num, u16 pkey, u16 *index);
+
 /**
  * ib_alloc_pd - Allocates an unused protection domain.
  * @device: The device on which to allocate the protection domain.