From d47effe1be0c4fc983306a9c704632e3a087eed8 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Tue, 1 Mar 2011 17:06:37 +0530 Subject: vhost: Cleanup vhost.c and net.c Minor cleanup of vhost.c and net.c to match coding style. Signed-off-by: Krishna Kumar Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 19 +++++++++++------- drivers/vhost/vhost.c | 53 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f616cef..59dad9f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -60,6 +60,7 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to, { int seg = 0; size_t size; + while (len && seg < iov_count) { size = min(from->iov_len, len); to->iov_base = from->iov_base; @@ -79,6 +80,7 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, { int seg = 0; size_t size; + while (len && seg < iovcount) { size = min(from->iov_len, len); to->iov_base = from->iov_base; @@ -296,17 +298,16 @@ static void handle_rx_big(struct vhost_net *net) .msg_iov = vq->iov, .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; - size_t len, total_len = 0; int err; size_t hdr_size; /* TODO: check that we are running from vhost_worker? */ struct socket *sock = rcu_dereference_check(vq->private_data, 1); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) return; @@ -405,18 +406,17 @@ static void handle_rx_mergeable(struct vhost_net *net) .msg_iov = vq->iov, .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr_mrg_rxbuf hdr = { .hdr.flags = 0, .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE }; - size_t total_len = 0; int err, headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; /* TODO: check that we are running from vhost_worker? */ struct socket *sock = rcu_dereference_check(vq->private_data, 1); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) return; @@ -654,6 +654,7 @@ static struct socket *get_raw_socket(int fd) } uaddr; int uaddr_len = sizeof uaddr, r; struct socket *sock = sockfd_lookup(fd, &r); + if (!sock) return ERR_PTR(-ENOTSOCK); @@ -682,6 +683,7 @@ static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; + if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); @@ -696,6 +698,7 @@ static struct socket *get_tap_socket(int fd) static struct socket *get_socket(int fd) { struct socket *sock; + /* special case to disable backend */ if (fd == -1) return NULL; @@ -741,9 +744,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) oldsock = rcu_dereference_protected(vq->private_data, lockdep_is_held(&vq->mutex)); if (sock != oldsock) { - vhost_net_disable_vq(n, vq); - rcu_assign_pointer(vq->private_data, sock); - vhost_net_enable_vq(n, vq); + vhost_net_disable_vq(n, vq); + rcu_assign_pointer(vq->private_data, sock); + vhost_net_enable_vq(n, vq); } mutex_unlock(&vq->mutex); @@ -768,6 +771,7 @@ static long vhost_net_reset_owner(struct vhost_net *n) struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; + mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) @@ -829,6 +833,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, struct vhost_vring_file backend; u64 features; int r; + switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ade0568..b0cc7f8 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -41,8 +41,8 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct vhost_poll *poll; - poll = container_of(pt, struct vhost_poll, table); + poll = container_of(pt, struct vhost_poll, table); poll->wqh = wqh; add_wait_queue(wqh, &poll->wait); } @@ -85,6 +85,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, void vhost_poll_start(struct vhost_poll *poll, struct file *file) { unsigned long mask; + mask = file->f_op->poll(file, &poll->table); if (mask) vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); @@ -101,6 +102,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) { int left; + spin_lock_irq(&dev->work_lock); left = seq - work->done_seq; spin_unlock_irq(&dev->work_lock); @@ -222,6 +224,7 @@ static int vhost_worker(void *data) static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) { int i; + for (i = 0; i < dev->nvqs; ++i) { dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect * UIO_MAXIOV, GFP_KERNEL); @@ -235,6 +238,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) goto err_nomem; } return 0; + err_nomem: for (; i >= 0; --i) { kfree(dev->vqs[i].indirect); @@ -247,6 +251,7 @@ err_nomem: static void vhost_dev_free_iovecs(struct vhost_dev *dev) { int i; + for (i = 0; i < dev->nvqs; ++i) { kfree(dev->vqs[i].indirect); dev->vqs[i].indirect = NULL; @@ -296,26 +301,28 @@ long vhost_dev_check_owner(struct vhost_dev *dev) } struct vhost_attach_cgroups_struct { - struct vhost_work work; - struct task_struct *owner; - int ret; + struct vhost_work work; + struct task_struct *owner; + int ret; }; static void vhost_attach_cgroups_work(struct vhost_work *work) { - struct vhost_attach_cgroups_struct *s; - s = container_of(work, struct vhost_attach_cgroups_struct, work); - s->ret = cgroup_attach_task_all(s->owner, current); + struct vhost_attach_cgroups_struct *s; + + s = container_of(work, struct vhost_attach_cgroups_struct, work); + s->ret = cgroup_attach_task_all(s->owner, current); } static int vhost_attach_cgroups(struct vhost_dev *dev) { - struct vhost_attach_cgroups_struct attach; - attach.owner = current; - vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_work_flush(dev, &attach.work); - return attach.ret; + struct vhost_attach_cgroups_struct attach; + + attach.owner = current; + vhost_work_init(&attach.work, vhost_attach_cgroups_work); + vhost_work_queue(dev, &attach.work); + vhost_work_flush(dev, &attach.work); + return attach.ret; } /* Caller should have device mutex */ @@ -323,11 +330,13 @@ static long vhost_dev_set_owner(struct vhost_dev *dev) { struct task_struct *worker; int err; + /* Is there an owner already? */ if (dev->mm) { err = -EBUSY; goto err_mm; } + /* No owner, become one */ dev->mm = get_task_mm(current); worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); @@ -380,6 +389,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev) void vhost_dev_cleanup(struct vhost_dev *dev) { int i; + for (i = 0; i < dev->nvqs; ++i) { if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { vhost_poll_stop(&dev->vqs[i].poll); @@ -421,6 +431,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) { u64 a = addr / VHOST_PAGE_SIZE / 8; + /* Make sure 64 bit math will not overflow. */ if (a > ULONG_MAX - (unsigned long)log_base || a + (unsigned long)log_base > ULONG_MAX) @@ -461,6 +472,7 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, int log_all) { int i; + for (i = 0; i < d->nvqs; ++i) { int ok; mutex_lock(&d->vqs[i].mutex); @@ -527,6 +539,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem, *oldmem; unsigned long size = offsetof(struct vhost_memory, regions); + if (copy_from_user(&mem, m, size)) return -EFAULT; if (mem.padding) @@ -544,7 +557,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return -EFAULT; } - if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) { + if (!memory_access_ok(d, newmem, + vhost_has_feature(d, VHOST_F_LOG_ALL))) { kfree(newmem); return -EFAULT; } @@ -560,6 +574,7 @@ static int init_used(struct vhost_virtqueue *vq, struct vring_used __user *used) { int r = put_user(vq->used_flags, &used->flags); + if (r) return r; return get_user(vq->last_used_idx, &used->idx); @@ -849,6 +864,7 @@ static const struct vhost_memory_region *find_region(struct vhost_memory *mem, { struct vhost_memory_region *reg; int i; + /* linear search is not brilliant, but we really have on the order of 6 * regions in practice */ for (i = 0; i < mem->nregions; ++i) { @@ -871,6 +887,7 @@ static int set_bit_to_user(int nr, void __user *addr) void *base; int bit = nr + (log % PAGE_SIZE) * 8; int r; + r = get_user_pages_fast(log, 1, 1, &page); if (r < 0) return r; @@ -888,6 +905,7 @@ static int log_write(void __user *log_base, { u64 write_page = write_address / VHOST_PAGE_SIZE; int r; + if (!write_length) return 0; write_length += write_address % VHOST_PAGE_SIZE; @@ -1037,8 +1055,8 @@ static int get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq, i, count); return -EINVAL; } - if (unlikely(memcpy_fromiovec((unsigned char *)&desc, vq->indirect, - sizeof desc))) { + if (unlikely(memcpy_fromiovec((unsigned char *)&desc, + vq->indirect, sizeof desc))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)indirect->addr + i * sizeof desc); return -EINVAL; @@ -1317,6 +1335,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) { __u16 flags; + /* Flush out used index updates. This is paired * with the barrier that the Guest executes when enabling * interrupts. */ @@ -1361,6 +1380,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq) { u16 avail_idx; int r; + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) return false; vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; @@ -1387,6 +1407,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq) void vhost_disable_notify(struct vhost_virtqueue *vq) { int r; + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) return; vq->used_flags |= VRING_USED_F_NO_NOTIFY; -- cgit v1.1 From fcc042a2806064ffcaed7a0c5cb710eca0e99108 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 6 Mar 2011 13:33:49 +0200 Subject: vhost: copy_from_user -> __copy_from_user copy_from_user is pretty high on perf top profile, replacing it with __copy_from_user helps. It's also safe because we do access_ok checks during setup. Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b0cc7f8..2ab2912 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1171,7 +1171,7 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, i, vq->num, head); return -EINVAL; } - ret = copy_from_user(&desc, vq->desc + i, sizeof desc); + ret = __copy_from_user(&desc, vq->desc + i, sizeof desc); if (unlikely(ret)) { vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", i, vq->desc + i); -- cgit v1.1 From cfbdab951369f15de890597530076bf0119361be Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 17 Jan 2011 16:10:59 +0800 Subject: vhost-net: check the support of mergeable buffer outside the receive loop No need to check the support of mergeable buffer inside the recevie loop as the whole handle_rx()_xx is in the read critical region. So this patch move it ahead of the receiving loop. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 59dad9f..9f57cd4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net) .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; - int err, headcount; + int err, headcount, mergeable; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; /* TODO: check that we are running from vhost_worker? */ @@ -427,6 +427,7 @@ static void handle_rx_mergeable(struct vhost_net *net) vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL; + mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF); while ((sock_len = peek_head_len(sock->sk))) { sock_len += sock_hlen; @@ -476,7 +477,7 @@ static void handle_rx_mergeable(struct vhost_net *net) break; } /* TODO: Should check and handle checksum. */ - if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) && + if (likely(mergeable) && memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount, offsetof(typeof(hdr), num_buffers), sizeof hdr.num_buffers)) { -- cgit v1.1 From 94249369e9930276e30087da205349a55478cbb5 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 17 Jan 2011 16:11:08 +0800 Subject: vhost-net: Unify the code of mergeable and big buffer handling Codes duplication were found between the handling of mergeable and big buffers, so this patch tries to unify them. This could be easily done by adding a quota to the get_rx_bufs() which is used to limit the number of buffers it returns (for mergeable buffer, the quota is simply UIO_MAXIOV, for big buffers, the quota is just 1), and then the previous handle_rx_mergeable() could be resued also for big buffers. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 128 +++------------------------------------------------- 1 file changed, 7 insertions(+), 121 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9f57cd4..0329c41 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -229,6 +229,7 @@ static int peek_head_len(struct sock *sk) * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset + * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_virtqueue *vq, @@ -236,7 +237,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, int datalen, unsigned *iovcount, struct vhost_log *log, - unsigned *log_num) + unsigned *log_num, + unsigned int quota) { unsigned int out, in; int seg = 0; @@ -244,7 +246,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, unsigned d; int r, nlogs = 0; - while (datalen > 0) { + while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; @@ -284,116 +286,7 @@ err: /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ -static void handle_rx_big(struct vhost_net *net) -{ - struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; - unsigned out, in, log, s; - int head; - struct vhost_log *vq_log; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_control = NULL, /* FIXME: get and handle RX aux data. */ - .msg_controllen = 0, - .msg_iov = vq->iov, - .msg_flags = MSG_DONTWAIT, - }; - struct virtio_net_hdr hdr = { - .flags = 0, - .gso_type = VIRTIO_NET_HDR_GSO_NONE - }; - size_t len, total_len = 0; - int err; - size_t hdr_size; - /* TODO: check that we are running from vhost_worker? */ - struct socket *sock = rcu_dereference_check(vq->private_data, 1); - - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) - return; - - mutex_lock(&vq->mutex); - vhost_disable_notify(vq); - hdr_size = vq->vhost_hlen; - - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? - vq->log : NULL; - - for (;;) { - head = vhost_get_vq_desc(&net->dev, vq, vq->iov, - ARRAY_SIZE(vq->iov), - &out, &in, - vq_log, &log); - /* On error, stop handling until the next kick. */ - if (unlikely(head < 0)) - break; - /* OK, now we need to know about added descriptors. */ - if (head == vq->num) { - if (unlikely(vhost_enable_notify(vq))) { - /* They have slipped one in as we were - * doing that: check again. */ - vhost_disable_notify(vq); - continue; - } - /* Nothing new? Wait for eventfd to tell us - * they refilled. */ - break; - } - /* We don't need to be notified again. */ - if (out) { - vq_err(vq, "Unexpected descriptor format for RX: " - "out %d, int %d\n", - out, in); - break; - } - /* Skip header. TODO: support TSO/mergeable rx buffers. */ - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in); - msg.msg_iovlen = in; - len = iov_length(vq->iov, in); - /* Sanity check */ - if (!len) { - vq_err(vq, "Unexpected header len for RX: " - "%zd expected %zd\n", - iov_length(vq->hdr, s), hdr_size); - break; - } - err = sock->ops->recvmsg(NULL, sock, &msg, - len, MSG_DONTWAIT | MSG_TRUNC); - /* TODO: Check specific error and bomb out unless EAGAIN? */ - if (err < 0) { - vhost_discard_vq_desc(vq, 1); - break; - } - /* TODO: Should check and handle checksum. */ - if (err > len) { - pr_debug("Discarded truncated rx packet: " - " len %d > %zd\n", err, len); - vhost_discard_vq_desc(vq, 1); - continue; - } - len = err; - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size); - if (err) { - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", - vq->iov->iov_base, err); - break; - } - len += hdr_size; - vhost_add_used_and_signal(&net->dev, vq, head, len); - if (unlikely(vq_log)) - vhost_log_write(vq, vq_log, log, len); - total_len += len; - if (unlikely(total_len >= VHOST_NET_WEIGHT)) { - vhost_poll_queue(&vq->poll); - break; - } - } - - mutex_unlock(&vq->mutex); -} - -/* Expects to be always run from workqueue - which acts as - * read-size critical section for our kind of RCU. */ -static void handle_rx_mergeable(struct vhost_net *net) +static void handle_rx(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; unsigned uninitialized_var(in), log; @@ -433,7 +326,8 @@ static void handle_rx_mergeable(struct vhost_net *net) sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads, vhost_len, - &in, vq_log, &log); + &in, vq_log, &log, + likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) break; @@ -499,14 +393,6 @@ static void handle_rx_mergeable(struct vhost_net *net) mutex_unlock(&vq->mutex); } -static void handle_rx(struct vhost_net *net) -{ - if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) - handle_rx_mergeable(net); - else - handle_rx_big(net); -} - static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, -- cgit v1.1 From 783e3988544b94ff3918666b9f36866ac547fba1 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 17 Jan 2011 16:11:17 +0800 Subject: vhost: lock receive queue, not the socket vhost takes a sock lock to try and prevent the skb from being pulled from the receive queue after skb_peek. However this is not the right lock to use for that, sk_receive_queue.lock is. Fix that up. Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 0329c41..5720301 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -213,12 +213,13 @@ static int peek_head_len(struct sock *sk) { struct sk_buff *head; int len = 0; + unsigned long flags; - lock_sock(sk); + spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); - if (head) + if (likely(head)) len = head->len; - release_sock(sk); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } -- cgit v1.1 From de4d768a428d9de943dd6dc82bcd61742955cb6e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 13 Mar 2011 23:00:52 +0200 Subject: vhost-net: remove unlocked use of receive_queue Use of skb_queue_empty(&sock->sk->sk_receive_queue) without taking the sk_receive_queue.lock is unsafe or useless. Take it out. Reported-by: Eric Dumazet Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 5720301..2f7c76a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -311,7 +311,7 @@ static void handle_rx(struct vhost_net *net) /* TODO: check that we are running from vhost_worker? */ struct socket *sock = rcu_dereference_check(vq->private_data, 1); - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) + if (!sock) return; mutex_lock(&vq->mutex); -- cgit v1.1 From 4363c2fddb1399b728ef21ee8101c148a311ea45 Mon Sep 17 00:00:00 2001 From: Alex Dubov Date: Wed, 16 Mar 2011 17:57:13 +0000 Subject: gianfar: Fall back to software tcp/udp checksum on older controllers As specified by errata eTSEC49 of MPC8548 and errata eTSEC12 of MPC83xx, older revisions of gianfar controllers will be unable to calculate a TCP/UDP packet checksum for some alignments of the appropriate FCB. This patch checks for FCB alignment on such controllers and falls back to software checksumming if the alignment is known to be bad. Signed-off-by: Alex Dubov Signed-off-by: David S. Miller --- drivers/net/gianfar.c | 16 ++++++++++++++-- drivers/net/gianfar.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index ccb231c..2a0ad9a 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -949,6 +949,11 @@ static void gfar_detect_errata(struct gfar_private *priv) (pvr == 0x80861010 && (mod & 0xfff9) == 0x80c0)) priv->errata |= GFAR_ERRATA_A002; + /* MPC8313 Rev < 2.0, MPC8548 rev 2.0 */ + if ((pvr == 0x80850010 && mod == 0x80b0 && rev < 0x0020) || + (pvr == 0x80210020 && mod == 0x8030 && rev == 0x0020)) + priv->errata |= GFAR_ERRATA_12; + if (priv->errata) dev_info(dev, "enabled errata workarounds, flags: 0x%x\n", priv->errata); @@ -2154,8 +2159,15 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) /* Set up checksumming */ if (CHECKSUM_PARTIAL == skb->ip_summed) { fcb = gfar_add_fcb(skb); - lstatus |= BD_LFLAG(TXBD_TOE); - gfar_tx_checksum(skb, fcb); + /* as specified by errata */ + if (unlikely(gfar_has_errata(priv, GFAR_ERRATA_12) + && ((unsigned long)fcb % 0x20) > 0x18)) { + __skb_pull(skb, GMAC_FCB_LEN); + skb_checksum_help(skb); + } else { + lstatus |= BD_LFLAG(TXBD_TOE); + gfar_tx_checksum(skb, fcb); + } } if (vlan_tx_tag_present(skb)) { diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h index 54de413..ec5d595 100644 --- a/drivers/net/gianfar.h +++ b/drivers/net/gianfar.h @@ -1039,6 +1039,7 @@ enum gfar_errata { GFAR_ERRATA_74 = 0x01, GFAR_ERRATA_76 = 0x02, GFAR_ERRATA_A002 = 0x04, + GFAR_ERRATA_12 = 0x08, /* a.k.a errata eTSEC49 */ }; /* Struct stolen almost completely (and shamelessly) from the FCC enet source -- cgit v1.1 From 67c5c6cb8129c595f21e88254a3fc6b3b841ae8e Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Thu, 17 Mar 2011 01:40:10 +0000 Subject: econet: 4 byte infoleak to the network struct aunhdr has 4 padding bytes between 'pad' and 'handle' fields on x86_64. These bytes are not initialized in the variable 'ah' before sending 'ah' to the network. This leads to 4 bytes kernel stack infoleak. This bug was introduced before the git epoch. Signed-off-by: Vasiliy Kulikov Acked-by: Phil Blundell Signed-off-by: David S. Miller --- net/econet/af_econet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 0c28263..116d3fd 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -435,10 +435,10 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, udpdest.sin_addr.s_addr = htonl(network | addr.station); } + memset(&ah, 0, sizeof(ah)); ah.port = port; ah.cb = cb & 0x7f; ah.code = 2; /* magic */ - ah.pad = 0; /* tack our header on the front of the iovec */ size = sizeof(struct aunhdr); -- cgit v1.1 From 5e5069b41d5b82bcadc1dbf73f48476b428c102f Mon Sep 17 00:00:00 2001 From: Roger Luethi Date: Thu, 17 Mar 2011 06:37:21 +0000 Subject: ethtool: __ethtool_set_sg: check for function pointer before using it __ethtool_set_sg does not check if dev->ethtool_ops->set_sg is defined which can result in a NULL pointer dereference when ethtool is used to change SG settings for drivers without SG support. Signed-off-by: Roger Luethi Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c1a71bb..a1086fb 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1457,6 +1457,9 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) { int err; + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + if (data && !(dev->features & NETIF_F_ALL_CSUM)) return -EINVAL; -- cgit v1.1 From 3a7da39d165e0c363c294feec119db1427032afd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 17 Mar 2011 07:34:32 +0000 Subject: ethtool: Compat handling for struct ethtool_rxnfc This structure was accidentally defined such that its layout can differ between 32-bit and 64-bit processes. Add compat structure definitions and an ioctl wrapper function. Signed-off-by: Ben Hutchings Acked-by: Alexander Duyck Cc: stable@kernel.org [2.6.30+] Signed-off-by: David S. Miller --- include/linux/ethtool.h | 34 +++++++++++++++ net/socket.c | 114 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 141 insertions(+), 7 deletions(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index aac3e2e..b297f28 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -13,6 +13,9 @@ #ifndef _LINUX_ETHTOOL_H #define _LINUX_ETHTOOL_H +#ifdef __KERNEL__ +#include +#endif #include #include @@ -450,6 +453,37 @@ struct ethtool_rxnfc { __u32 rule_locs[0]; }; +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT + +struct compat_ethtool_rx_flow_spec { + u32 flow_type; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethhdr ether_spec; + u8 hdata[72]; + } h_u, m_u; + compat_u64 ring_cookie; + u32 location; +}; + +struct compat_ethtool_rxnfc { + u32 cmd; + u32 flow_type; + compat_u64 data; + struct compat_ethtool_rx_flow_spec fs; + u32 rule_cnt; + u32 rule_locs[0]; +}; + +#endif /* CONFIG_COMPAT */ +#endif /* __KERNEL__ */ + /** * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR diff --git a/net/socket.c b/net/socket.c index 937d0fc..5212447 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2588,23 +2588,123 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) { + struct compat_ethtool_rxnfc __user *compat_rxnfc; + bool convert_in = false, convert_out = false; + size_t buf_size = ALIGN(sizeof(struct ifreq), 8); + struct ethtool_rxnfc __user *rxnfc; struct ifreq __user *ifr; + u32 rule_cnt = 0, actual_rule_cnt; + u32 ethcmd; u32 data; - void __user *datap; + int ret; + + if (get_user(data, &ifr32->ifr_ifru.ifru_data)) + return -EFAULT; - ifr = compat_alloc_user_space(sizeof(*ifr)); + compat_rxnfc = compat_ptr(data); - if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) + if (get_user(ethcmd, &compat_rxnfc->cmd)) return -EFAULT; - if (get_user(data, &ifr32->ifr_ifru.ifru_data)) + /* Most ethtool structures are defined without padding. + * Unfortunately struct ethtool_rxnfc is an exception. + */ + switch (ethcmd) { + default: + break; + case ETHTOOL_GRXCLSRLALL: + /* Buffer size is variable */ + if (get_user(rule_cnt, &compat_rxnfc->rule_cnt)) + return -EFAULT; + if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) + return -ENOMEM; + buf_size += rule_cnt * sizeof(u32); + /* fall through */ + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + convert_out = true; + /* fall through */ + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + buf_size += sizeof(struct ethtool_rxnfc); + convert_in = true; + break; + } + + ifr = compat_alloc_user_space(buf_size); + rxnfc = (void *)ifr + ALIGN(sizeof(struct ifreq), 8); + + if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) return -EFAULT; - datap = compat_ptr(data); - if (put_user(datap, &ifr->ifr_ifru.ifru_data)) + if (put_user(convert_in ? rxnfc : compat_ptr(data), + &ifr->ifr_ifru.ifru_data)) return -EFAULT; - return dev_ioctl(net, SIOCETHTOOL, ifr); + if (convert_in) { + /* We expect there to be holes between fs.m_u and + * fs.ring_cookie and at the end of fs, but nowhere else. + */ + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) + + sizeof(compat_rxnfc->fs.m_u) != + offsetof(struct ethtool_rxnfc, fs.m_u) + + sizeof(rxnfc->fs.m_u)); + BUILD_BUG_ON( + offsetof(struct compat_ethtool_rxnfc, fs.location) - + offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != + offsetof(struct ethtool_rxnfc, fs.location) - + offsetof(struct ethtool_rxnfc, fs.ring_cookie)); + + if (copy_in_user(rxnfc, compat_rxnfc, + (void *)(&rxnfc->fs.m_u + 1) - + (void *)rxnfc) || + copy_in_user(&rxnfc->fs.ring_cookie, + &compat_rxnfc->fs.ring_cookie, + (void *)(&rxnfc->fs.location + 1) - + (void *)&rxnfc->fs.ring_cookie) || + copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) + return -EFAULT; + } + + ret = dev_ioctl(net, SIOCETHTOOL, ifr); + if (ret) + return ret; + + if (convert_out) { + if (copy_in_user(compat_rxnfc, rxnfc, + (const void *)(&rxnfc->fs.m_u + 1) - + (const void *)rxnfc) || + copy_in_user(&compat_rxnfc->fs.ring_cookie, + &rxnfc->fs.ring_cookie, + (const void *)(&rxnfc->fs.location + 1) - + (const void *)&rxnfc->fs.ring_cookie) || + copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) + return -EFAULT; + + if (ethcmd == ETHTOOL_GRXCLSRLALL) { + /* As an optimisation, we only copy the actual + * number of rules that the underlying + * function returned. Since Mallory might + * change the rule count in user memory, we + * check that it is less than the rule count + * originally given (as the user buffer size), + * which has been range-checked. + */ + if (get_user(actual_rule_cnt, &rxnfc->rule_cnt)) + return -EFAULT; + if (actual_rule_cnt < rule_cnt) + rule_cnt = actual_rule_cnt; + if (copy_in_user(&compat_rxnfc->rule_locs[0], + &rxnfc->rule_locs[0], + rule_cnt * sizeof(u32))) + return -EFAULT; + } + } + + return 0; } static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) -- cgit v1.1 From d870bfb9d366c5d466c0f5419a4ec95a3f71ea8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Mar 2011 00:27:27 +0000 Subject: vlan: should take into account needed_headroom Commit c95b819ad7 (gre: Use needed_headroom) made gre use needed_headroom instead of hard_header_len This uncover a bug in vlan code. We should make sure vlan devices take into account their real_dev->needed_headroom or we risk a crash in ipgre_header(), because we dont have enough room to push IP header in skb. Reported-by: Diddi Oscarsson Signed-off-by: Eric Dumazet Cc: Patrick McHardy Cc: Herbert Xu Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ae610f0..e34ea9e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -720,6 +720,7 @@ static int vlan_dev_init(struct net_device *dev) dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; #endif + dev->needed_headroom = real_dev->needed_headroom; if (real_dev->features & NETIF_F_HW_VLAN_TX) { dev->header_ops = real_dev->header_ops; dev->hard_header_len = real_dev->hard_header_len; -- cgit v1.1 From 6b1e960fdbd75dcd9bcc3ba5ff8898ff1ad30b6e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 18 Mar 2011 05:27:28 +0000 Subject: bridge: Reset IPCB when entering IP stack on NF_FORWARD Whenever we enter the IP stack proper from bridge netfilter we need to ensure that the skb is in a form the IP stack expects it to be in. The entry point on NF_FORWARD did not meet the requirements of the IP stack, therefore leading to potential crashes/panics. This patch fixes the problem. Signed-off-by: Herbert Xu Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index f97af559..008ff6c 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -739,6 +739,9 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, nf_bridge->mask |= BRNF_PKT_TYPE; } + if (br_parse_ip_options(skb)) + return NF_DROP; + /* The physdev module checks on this */ nf_bridge->mask |= BRNF_BRIDGED; nf_bridge->physoutdev = skb->dev; -- cgit v1.1 From b51bdad63046d1d5a4807630cc8c02845cf67893 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 18 Mar 2011 08:50:37 +0000 Subject: headers: use __aligned_xx types for userspace Now that we finally have __aligned_xx exported to userspace, convert the headers that get exported over to the proper type. Signed-off-by: Mike Frysinger Signed-off-by: David S. Miller --- include/linux/if_ppp.h | 16 ++++++++-------- include/linux/netfilter/nfnetlink_log.h | 4 ++-- include/linux/netfilter/nfnetlink_queue.h | 4 ++-- include/linux/netfilter/xt_connbytes.h | 4 ++-- include/linux/netfilter/xt_quota.h | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/if_ppp.h b/include/linux/if_ppp.h index fcef103..c9ad383 100644 --- a/include/linux/if_ppp.h +++ b/include/linux/if_ppp.h @@ -114,14 +114,14 @@ struct pppol2tp_ioc_stats { __u16 tunnel_id; /* redundant */ __u16 session_id; /* if zero, get tunnel stats */ __u32 using_ipsec:1; /* valid only for session_id == 0 */ - aligned_u64 tx_packets; - aligned_u64 tx_bytes; - aligned_u64 tx_errors; - aligned_u64 rx_packets; - aligned_u64 rx_bytes; - aligned_u64 rx_seq_discards; - aligned_u64 rx_oos_packets; - aligned_u64 rx_errors; + __aligned_u64 tx_packets; + __aligned_u64 tx_bytes; + __aligned_u64 tx_errors; + __aligned_u64 rx_packets; + __aligned_u64 rx_bytes; + __aligned_u64 rx_seq_discards; + __aligned_u64 rx_oos_packets; + __aligned_u64 rx_errors; }; #define ifr__name b.ifr_ifrn.ifrn_name diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h index ea9b8d3..90c2c95 100644 --- a/include/linux/netfilter/nfnetlink_log.h +++ b/include/linux/netfilter/nfnetlink_log.h @@ -28,8 +28,8 @@ struct nfulnl_msg_packet_hw { }; struct nfulnl_msg_packet_timestamp { - aligned_be64 sec; - aligned_be64 usec; + __aligned_be64 sec; + __aligned_be64 usec; }; enum nfulnl_attr_type { diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h index 2455fe5..af94e00 100644 --- a/include/linux/netfilter/nfnetlink_queue.h +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -25,8 +25,8 @@ struct nfqnl_msg_packet_hw { }; struct nfqnl_msg_packet_timestamp { - aligned_be64 sec; - aligned_be64 usec; + __aligned_be64 sec; + __aligned_be64 usec; }; enum nfqnl_attr_type { diff --git a/include/linux/netfilter/xt_connbytes.h b/include/linux/netfilter/xt_connbytes.h index 92fcbb0..f1d6c15 100644 --- a/include/linux/netfilter/xt_connbytes.h +++ b/include/linux/netfilter/xt_connbytes.h @@ -17,8 +17,8 @@ enum xt_connbytes_direction { struct xt_connbytes_info { struct { - aligned_u64 from; /* count to be matched */ - aligned_u64 to; /* count to be matched */ + __aligned_u64 from; /* count to be matched */ + __aligned_u64 to; /* count to be matched */ } count; __u8 what; /* ipt_connbytes_what */ __u8 direction; /* ipt_connbytes_direction */ diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h index ca6e03e..9314723 100644 --- a/include/linux/netfilter/xt_quota.h +++ b/include/linux/netfilter/xt_quota.h @@ -13,7 +13,7 @@ struct xt_quota_priv; struct xt_quota_info { __u32 flags; __u32 pad; - aligned_u64 quota; + __aligned_u64 quota; /* Used internally by the kernel */ struct xt_quota_priv *master; -- cgit v1.1 From 93d03203d5a165d7a757546245dd1543dfe0ff80 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Mar 2011 21:53:03 -0700 Subject: ftmac100: use resource_size() The calculation is off-by-one. It should be "end - start + 1". This patch fixes it to use resource_size() instead. Oddly, the code already uses resource size correctly a couple lines earlier when it calls request_mem_region() for this memory. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ftmac100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ftmac100.c b/drivers/net/ftmac100.c index 1d6f4b8..a316619 100644 --- a/drivers/net/ftmac100.c +++ b/drivers/net/ftmac100.c @@ -1102,7 +1102,7 @@ static int ftmac100_probe(struct platform_device *pdev) goto err_req_mem; } - priv->base = ioremap(res->start, res->end - res->start); + priv->base = ioremap(res->start, resource_size(res)); if (!priv->base) { dev_err(&pdev->dev, "Failed to ioremap ethernet registers\n"); err = -EIO; -- cgit v1.1 From dadaa10b077133e5c03333131b82ecb13679af2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20de=20Peslo=C3=BCan?= Date: Sat, 19 Mar 2011 13:36:18 -0700 Subject: bonding: fix a typo in a comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nicolas de Pesloüan Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1a6e9eb..338bea1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2130,7 +2130,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev) } /* -* First release a slave and than destroy the bond if no more slaves are left. +* First release a slave and then destroy the bond if no more slaves are left. * Must be under rtnl_lock when this function is called. */ static int bond_release_and_destroy(struct net_device *bond_dev, -- cgit v1.1 From b26fa4e0275426450238a14158bc1db24bb696e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Mar 2011 05:39:11 +0000 Subject: r8169: fix a bug in rtl8169_init_phy() commit 54405cde7624 (r8169: support control of advertising.) introduced a bug in rtl8169_init_phy() Reported-by: Piotr Hosowicz Signed-off-by: Eric Dumazet Cc: Oliver Neukum Cc: Francois Romieu Tested-by: Anca Emanuel Tested-by: Piotr Hosowicz Signed-off-by: David S. Miller --- drivers/net/r8169.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 5e40351..493b0de 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2685,9 +2685,9 @@ static void rtl8169_init_phy(struct net_device *dev, struct rtl8169_private *tp) rtl8169_set_speed(dev, AUTONEG_ENABLE, SPEED_1000, DUPLEX_FULL, ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full | ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full | - tp->mii.supports_gmii ? + (tp->mii.supports_gmii ? ADVERTISED_1000baseT_Half | - ADVERTISED_1000baseT_Full : 0); + ADVERTISED_1000baseT_Full : 0)); if (RTL_R8(PHYstatus) & TBI_Enable) netif_info(tp, link, dev, "TBI auto-negotiating\n"); -- cgit v1.1 From a769f4968396093d5cc1b1a86204cef579784b24 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 19 Mar 2011 23:06:33 -0700 Subject: niu: Rename NIU parent platform device name to fix conflict. When the OF device driver bits were converted over to the platform device infrastructure in commit 74888760d40b3ac9054f9c5fa07b566c0676ba2d ("dt/net: Eliminate users of of_platform_{,un}register_driver") we inadvertantly created probing problems in the OF case. The NIU driver creates a dummy platform device to represent the board that contains one or more child NIU devices. Unfortunately we use the same name, "niu", as the OF device driver itself uses. The result is that we try to probe the dummy "niu" parent device we create, and since it has a NULL ofdevice pointer etc. everything explodes: [783019.128243] niu: niu.c:v1.1 (Apr 22, 2010) [783019.128810] Unable to handle kernel NULL pointer dereference [783019.128949] tsk->{mm,active_mm}->context = 000000000000039e [783019.129078] tsk->{mm,active_mm}->pgd = fffff803afc5a000 [783019.129206] \|/ ____ \|/ [783019.129213] "@'/ .. \`@" [783019.129220] /_| \__/ |_\ [783019.129226] \__U_/ [783019.129378] modprobe(2004): Oops [#1] [783019.129423] TSTATE: 0000000011001602 TPC: 0000000010052ff8 TNPC: 000000000061bbb4 Y: 00000000 Not tainted [783019.129542] TPC: [783019.129624] g0: 8080000000000000 g1: 0000000000000000 g2: 0000000010056000 g3: 0000000000000002 [783019.129733] g4: fffff803fc1da0c0 g5: fffff800441e2000 g6: fffff803fba84000 g7: 0000000000000000 [783019.129842] o0: fffff803fe7df010 o1: 0000000010055700 o2: 0000000000000000 o3: fffff803fbacaca0 [783019.129951] o4: 0000000000000080 o5: 0000000000777908 sp: fffff803fba866e1 ret_pc: 0000000010052ff4 [783019.130083] RPC: [783019.130165] l0: fffff803fe7df010 l1: fffff803fbacafc0 l2: fffff803fbacaca0 l3: ffffffffffffffed [783019.130273] l4: 0000000000000000 l5: 000000007fffffff l6: fffff803fba86f40 l7: 0000000000000001 [783019.130382] i0: fffff803fe7df000 i1: fffff803fc20aba0 i2: 0000000000000000 i3: 0000000000000001 [783019.130490] i4: 0000000000000000 i5: 0000000000000000 i6: fffff803fba867a1 i7: 000000000062038c [783019.130614] I7: Fix by simply renaming the parent device to "niu-board". Signed-off-by: David S. Miller --- drivers/net/niu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 40fa59e..32678b6 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -9501,7 +9501,7 @@ static struct niu_parent * __devinit niu_new_parent(struct niu *np, struct niu_parent *p; int i; - plat_dev = platform_device_register_simple("niu", niu_parent_index, + plat_dev = platform_device_register_simple("niu-board", niu_parent_index, NULL, 0); if (IS_ERR(plat_dev)) return NULL; -- cgit v1.1 From 5e0c1eb7e6b61998c7ecd39b7f69a15773d894d4 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sun, 20 Mar 2011 15:33:26 +0100 Subject: netfilter: ipset: fix address ranges at hash:*port* types The hash:*port* types with IPv4 silently ignored when address ranges with non TCP/UDP were added/deleted from the set and used the first address from the range only. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter/ipset/ip_set_getport.h | 10 ++++++++ net/netfilter/ipset/ip_set_hash_ipport.c | 34 ++++++++------------------ net/netfilter/ipset/ip_set_hash_ipportip.c | 34 ++++++++------------------ net/netfilter/ipset/ip_set_hash_ipportnet.c | 34 ++++++++------------------ net/netfilter/ipset/ip_set_hash_netport.c | 30 ++++++----------------- 5 files changed, 48 insertions(+), 94 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h index 3882a81..5aebd17 100644 --- a/include/linux/netfilter/ipset/ip_set_getport.h +++ b/include/linux/netfilter/ipset/ip_set_getport.h @@ -18,4 +18,14 @@ static inline bool ip_set_get_ip6_port(const struct sk_buff *skb, bool src, extern bool ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port); +static inline bool ip_set_proto_with_ports(u8 proto) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + } + return false; +} + #endif /*_IP_SET_GETPORT_H*/ diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index adbe787..b921414 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -150,6 +150,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipport4_elem data = { }; u32 ip, ip_to, p, port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -172,21 +173,15 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -195,7 +190,6 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &data, timeout); @@ -219,13 +213,12 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else ip_to = ip; - port = ntohs(data.port); - if (tb[IPSET_ATTR_PORT_TO]) { + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); - } else - port_to = port; + } for (; !before(ip_to, ip); ip++) for (p = port; p <= port_to; p++) { @@ -361,6 +354,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipport6_elem data = { }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -385,21 +379,15 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -407,9 +395,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 22e23ab..4642872 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -154,6 +154,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportip4_elem data = { }; u32 ip, ip_to, p, port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -180,21 +181,15 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -203,7 +198,6 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &data, timeout); @@ -227,13 +221,12 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else ip_to = ip; - port = ntohs(data.port); - if (tb[IPSET_ATTR_PORT_TO]) { + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); - } else - port_to = port; + } for (; !before(ip_to, ip); ip++) for (p = port; p <= port_to; p++) { @@ -375,6 +368,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportip6_elem data = { }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -403,21 +397,15 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -425,9 +413,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 6033e8b..2cb84a5 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -174,6 +174,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportnet4_elem data = { .cidr = HOST_MASK }; u32 ip, ip_to, p, port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -208,21 +209,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -231,7 +226,6 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &data, timeout); @@ -255,13 +249,12 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else ip_to = ip; - port = ntohs(data.port); - if (tb[IPSET_ATTR_PORT_TO]) { + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); - } else - port_to = port; + } for (; !before(ip_to, ip); ip++) for (p = port; p <= port_to; p++) { @@ -429,6 +422,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportnet6_elem data = { .cidr = HOST_MASK }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -465,21 +459,15 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -487,9 +475,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 34a1656..8598676 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -170,6 +170,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netport4_elem data = { .cidr = HOST_MASK }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -198,21 +199,15 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -220,9 +215,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } @@ -390,6 +383,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netport6_elem data = { .cidr = HOST_MASK }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -418,21 +412,15 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -440,9 +428,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } -- cgit v1.1 From 5c1aba467828bf0574ec5754c84884d573f590af Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sun, 20 Mar 2011 15:35:01 +0100 Subject: netfilter: ipset: fix checking the type revision at create command The revision of the set type was not checked at the create command: if the userspace sent a valid set type but with not supported revision number, it'd create a loop. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- net/netfilter/ipset/ip_set_core.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 618a615..d6b4823 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -94,16 +94,28 @@ static int find_set_type_get(const char *name, u8 family, u8 revision, struct ip_set_type **found) { + struct ip_set_type *type; + int err; + rcu_read_lock(); *found = find_set_type(name, family, revision); if (*found) { - int err = !try_module_get((*found)->me); - rcu_read_unlock(); - return err ? -EFAULT : 0; + err = !try_module_get((*found)->me) ? -EFAULT : 0; + goto unlock; } + /* Make sure the type is loaded but we don't support the revision */ + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name)) { + err = -IPSET_ERR_FIND_TYPE; + goto unlock; + } rcu_read_unlock(); return try_to_load_type(name); + +unlock: + rcu_read_unlock(); + return err; } /* Find a given set type by name and family. @@ -116,7 +128,7 @@ find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max) struct ip_set_type *type; bool found = false; - *min = *max = 0; + *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STREQ(type->name, name) && @@ -124,7 +136,7 @@ find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max) found = true; if (type->revision < *min) *min = type->revision; - else if (type->revision > *max) + if (type->revision > *max) *max = type->revision; } rcu_read_unlock(); -- cgit v1.1 From db856674ac69e31946e56085239757cca3f7655f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 20 Mar 2011 15:40:06 +0100 Subject: netfilter: xtables: fix reentrancy commit f3c5c1bfd4308 (make ip_tables reentrant) introduced a race in handling the stackptr restore, at the end of ipt_do_table() We should do it before the call to xt_info_rdunlock_bh(), or we allow cpu preemption and another cpu overwrites stackptr of original one. A second fix is to change the underflow test to check the origptr value instead of 0 to detect underflow, or else we allow a jump from different hooks. Signed-off-by: Eric Dumazet Cc: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ip_tables.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b09ed0d..ffcea0d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -387,7 +387,7 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) { + if (*stackptr <= origptr) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " @@ -427,10 +427,10 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c9598a9..0b2af9b 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -410,7 +410,7 @@ ip6t_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) + if (*stackptr <= origptr) e = get_entry(table_base, private->underflow[hook]); else @@ -441,8 +441,8 @@ ip6t_do_table(struct sk_buff *skb, break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); *stackptr = origptr; + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; -- cgit v1.1 From 961ed183a9fd080cf306c659b8736007e44065a5 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Sun, 20 Mar 2011 15:42:52 +0100 Subject: netfilter: ipt_CLUSTERIP: fix buffer overflow 'buffer' string is copied from userspace. It is not checked whether it is zero terminated. This may lead to overflow inside of simple_strtoul(). Changli Gao suggested to copy not more than user supplied 'size' bytes. It was introduced before the git epoch. Files "ipt_CLUSTERIP/*" are root writable only by default, however, on some setups permissions might be relaxed to e.g. network admin user. Signed-off-by: Vasiliy Kulikov Acked-by: Changli Gao Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 403ca57..d609ac3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -664,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, char buffer[PROC_WRITELEN+1]; unsigned long nodenum; - if (copy_from_user(buffer, input, PROC_WRITELEN)) + if (size > PROC_WRITELEN) + return -EIO; + if (copy_from_user(buffer, input, size)) return -EFAULT; + buffer[size] = 0; if (*buffer == '+') { nodenum = simple_strtoul(buffer+1, NULL, 10); -- cgit v1.1 From a454f0ccefbfdbfc0e1aa8a5f8010af5e48b8845 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 21 Mar 2011 18:08:28 -0700 Subject: xfrm: Fix initialize repl field of struct xfrm_state Commit 'xfrm: Move IPsec replay detection functions to a separate file' (9fdc4883d92d20842c5acea77a4a21bb1574b495) introduce repl field to struct xfrm_state, and only initialize it under SA's netlink create path, the other path, such as pf_key, ipcomp/ipcomp6 etc, the repl field remaining uninitialize. So if the SA is created by pf_key, any input packet with SA's encryption algorithm will cause panic. int xfrm_input() { ... x->repl->advance(x, seq); ... } This patch fixed it by introduce new function __xfrm_init_state(). Pid: 0, comm: swapper Not tainted 2.6.38-next+ #14 Bochs Bochs EIP: 0060:[] EFLAGS: 00010206 CPU: 0 EIP is at xfrm_input+0x31c/0x4cc EAX: dd839c00 EBX: 00000084 ECX: 00000000 EDX: 01000000 ESI: dd839c00 EDI: de3a0780 EBP: dec1de88 ESP: dec1de64 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process swapper (pid: 0, ti=dec1c000 task=c09c0f20 task.ti=c0992000) Stack: 00000000 00000000 00000002 c0ba27c0 00100000 01000000 de3a0798 c0ba27c0 00000033 dec1de98 c0786848 00000000 de3a0780 dec1dea4 c0786868 00000000 dec1debc c074ee56 e1da6b8c de3a0780 c074ed44 de3a07a8 dec1decc c074ef32 Call Trace: [] xfrm4_rcv_encap+0x22/0x27 [] xfrm4_rcv+0x1b/0x1d [] ip_local_deliver_finish+0x112/0x1b1 [] ? ip_local_deliver_finish+0x0/0x1b1 [] NF_HOOK.clone.1+0x3d/0x44 [] ip_local_deliver+0x3e/0x44 [] ? ip_local_deliver_finish+0x0/0x1b1 [] ip_rcv_finish+0x30a/0x332 [] ? ip_rcv_finish+0x0/0x332 [] NF_HOOK.clone.1+0x3d/0x44 [] ip_rcv+0x20b/0x247 [] ? ip_rcv_finish+0x0/0x332 [] __netif_receive_skb+0x373/0x399 [] netif_receive_skb+0x4b/0x51 [] cp_rx_poll+0x210/0x2c4 [8139cp] [] net_rx_action+0x9a/0x17d [] __do_softirq+0xa1/0x149 [] ? __do_softirq+0x0/0x149 Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- include/net/xfrm.h | 1 + net/xfrm/xfrm_state.c | 15 ++++++++++++++- net/xfrm/xfrm_user.c | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 42a8c32..cffa5dc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1430,6 +1430,7 @@ extern void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); extern u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); extern int xfrm_init_replay(struct xfrm_state *x); extern int xfrm_state_mtu(struct xfrm_state *x, int mtu); +extern int __xfrm_init_state(struct xfrm_state *x, bool init_replay); extern int xfrm_init_state(struct xfrm_state *x); extern int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d575f05..f83a3d1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1907,7 +1907,7 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu) return res; } -int xfrm_init_state(struct xfrm_state *x) +int __xfrm_init_state(struct xfrm_state *x, bool init_replay) { struct xfrm_state_afinfo *afinfo; struct xfrm_mode *inner_mode; @@ -1980,12 +1980,25 @@ int xfrm_init_state(struct xfrm_state *x) if (x->outer_mode == NULL) goto error; + if (init_replay) { + err = xfrm_init_replay(x); + if (err) + goto error; + } + x->km.state = XFRM_STATE_VALID; error: return err; } +EXPORT_SYMBOL(__xfrm_init_state); + +int xfrm_init_state(struct xfrm_state *x) +{ + return __xfrm_init_state(x, true); +} + EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 706385a..fc152d2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -511,7 +511,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - err = xfrm_init_state(x); + err = __xfrm_init_state(x, false); if (err) goto error; -- cgit v1.1 From 8aa525a9340da4227797a06221ca08399006635f Mon Sep 17 00:00:00 2001 From: James Chapman Date: Mon, 21 Mar 2011 18:10:25 -0700 Subject: l2tp: fix possible oops on l2tp_eth module unload A struct used in the l2tp_eth driver for registering network namespace ops was incorrectly marked as __net_initdata, leading to oops when module unloaded. BUG: unable to handle kernel paging request at ffffffffa00ec098 IP: [] ops_exit_list+0x7/0x4b PGD 142d067 PUD 1431063 PMD 195da8067 PTE 0 Oops: 0000 [#1] SMP last sysfs file: /sys/module/l2tp_eth/refcnt Call Trace: [] ? unregister_pernet_operations+0x32/0x93 [] ? unregister_pernet_device+0x2b/0x38 [] ? sys_delete_module+0x1b8/0x222 [] ? do_munmap+0x254/0x318 [] ? page_fault+0x25/0x30 [] ? system_call_fastpath+0x16/0x1b Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 8d9ce0a..a8193f5 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -283,7 +283,7 @@ static __net_init int l2tp_eth_init_net(struct net *net) return 0; } -static __net_initdata struct pernet_operations l2tp_eth_net_ops = { +static struct pernet_operations l2tp_eth_net_ops = { .init = l2tp_eth_init_net, .id = &l2tp_eth_net_id, .size = sizeof(struct l2tp_eth_net), -- cgit v1.1 From 20246a800389fe5442675c59863fec5a4f520c7c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Mar 2011 18:12:54 -0700 Subject: snmp: SNMP_UPD_PO_STATS_BH() always called from softirq We dont need to test if we run from softirq context, we definitely are. This saves few instructions in ip_rcv() & ip_rcv_finish() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/snmp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/snmp.h b/include/net/snmp.h index 762e2ab..27461d6 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -150,7 +150,7 @@ struct linux_xfrm_mib { #define SNMP_UPD_PO_STATS_BH(mib, basefield, addend) \ do { \ __typeof__(*mib[0]) *ptr = \ - __this_cpu_ptr((mib)[!in_softirq()]); \ + __this_cpu_ptr((mib)[0]); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend;\ } while (0) @@ -202,7 +202,7 @@ struct linux_xfrm_mib { #define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend) \ do { \ __typeof__(*mib[0]) *ptr; \ - ptr = __this_cpu_ptr((mib)[!in_softirq()]); \ + ptr = __this_cpu_ptr((mib)[0]); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ -- cgit v1.1 From 674f2115995b7b588cbf3540c9f9b2448a8c7ea8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Mar 2011 18:16:39 -0700 Subject: ipx: fix ipx_release() Commit b0d0d915d1d1a0 (remove the BKL) added a regression, because sock_put() can free memory while we are going to use it later. Fix is to delay sock_put() _after_ release_sock(). Reported-by: Ingo Molnar Tested-by: Ingo Molnar Signed-off-by: Eric Dumazet Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 2731b51..9680226 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -148,7 +148,6 @@ static void ipx_destroy_socket(struct sock *sk) ipx_remove_socket(sk); skb_queue_purge(&sk->sk_receive_queue); sk_refcnt_debug_dec(sk); - sock_put(sk); } /* @@ -1404,6 +1403,7 @@ static int ipx_release(struct socket *sock) sk_refcnt_debug_release(sk); ipx_destroy_socket(sk); release_sock(sk); + sock_put(sk); out: return 0; } -- cgit v1.1 From b20e7bbfc7a15a4182730f0936433145992b4b06 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 21 Mar 2011 18:18:00 -0700 Subject: net/appletalk: fix atalk_release use after free The BKL removal in appletalk introduced a use-after-free problem, where atalk_destroy_socket frees a sock, but we still release the socket lock on it. An easy fix is to take an extra reference on the sock and sock_put it when returning from atalk_release. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3d4f4b0..206e771 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1051,6 +1051,7 @@ static int atalk_release(struct socket *sock) { struct sock *sk = sock->sk; + sock_hold(sk); lock_sock(sk); if (sk) { sock_orphan(sk); @@ -1058,6 +1059,8 @@ static int atalk_release(struct socket *sock) atalk_destroy_socket(sk); } release_sock(sk); + sock_put(sk); + return 0; } -- cgit v1.1 From 4f2d56c45fec7c15169599cab05e9f6df18769d0 Mon Sep 17 00:00:00 2001 From: Jan Altenberg Date: Mon, 21 Mar 2011 18:19:26 -0700 Subject: can: c_can: Do basic c_can configuration _before_ enabling the interrupts I ran into some trouble while testing the SocketCAN driver for the BOSCH C_CAN controller. The interface is not correctly initialized, if I put some CAN traffic on the line, _while_ the interface is being started (which means: the interface doesn't come up correcty, if there's some RX traffic while doing 'ifconfig can0 up'). The current implementation enables the controller interrupts _before_ doing the basic c_can configuration. I think, this should be done the other way round. The patch below fixes things for me. Signed-off-by: Jan Altenberg Acked-by: Kurt Van Dijck Acked-by: Wolfgang Grandegger Signed-off-by: David S. Miller --- drivers/net/can/c_can/c_can.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index 1405078..110eda0 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -633,9 +633,6 @@ static void c_can_start(struct net_device *dev) { struct c_can_priv *priv = netdev_priv(dev); - /* enable status change, error and module interrupts */ - c_can_enable_all_interrupts(priv, ENABLE_ALL_INTERRUPTS); - /* basic c_can configuration */ c_can_chip_config(dev); @@ -643,6 +640,9 @@ static void c_can_start(struct net_device *dev) /* reset tx helper pointers */ priv->tx_next = priv->tx_echo = 0; + + /* enable status change, error and module interrupts */ + c_can_enable_all_interrupts(priv, ENABLE_ALL_INTERRUPTS); } static void c_can_stop(struct net_device *dev) -- cgit v1.1 From ac0a121d7906b049dfee3649f886c969fbb3c1b7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 21 Mar 2011 18:20:26 -0700 Subject: net: fix incorrect spelling in drop monitor protocol It was pointed out to me recently that my spelling could be better :) Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 36e603c..706502f 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -350,7 +350,7 @@ static int __init init_net_drop_monitor(void) struct per_cpu_dm_data *data; int cpu, rc; - printk(KERN_INFO "Initalizing network drop monitor service\n"); + printk(KERN_INFO "Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); -- cgit v1.1 From d5cd92448fded12c91f7574e49747c5f7d975a8d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Mar 2011 18:22:22 -0700 Subject: macvlan: Fix use after free of struct macvlan_port. When the macvlan driver was extended to call unregisgter_netdevice_queue in 23289a37e2b127dfc4de1313fba15bb4c9f0cd5b, a use after free of struct macvlan_port was introduced. The code in dellink relied on unregister_netdevice actually unregistering the net device so it would be safe to free macvlan_port. Since unregister_netdevice_queue can just queue up the unregister instead of performing the unregiser immediately we free the macvlan_port too soon and then the code in macvlan_stop removes the macaddress for the set of macaddress to listen for and uses memory that has already been freed. To fix this add a reference count to track when it is safe to free the macvlan_port and move the call of macvlan_port_destroy into macvlan_uninit which is guaranteed to be called after the final macvlan_port_close. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 5b37d3c..78e34e9 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -39,8 +39,11 @@ struct macvlan_port { struct list_head vlans; struct rcu_head rcu; bool passthru; + int count; }; +static void macvlan_port_destroy(struct net_device *dev); + #define macvlan_port_get_rcu(dev) \ ((struct macvlan_port *) rcu_dereference(dev->rx_handler_data)) #define macvlan_port_get(dev) ((struct macvlan_port *) dev->rx_handler_data) @@ -457,8 +460,13 @@ static int macvlan_init(struct net_device *dev) static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); + struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); + + port->count -= 1; + if (!port->count) + macvlan_port_destroy(port->dev); } static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev, @@ -691,12 +699,13 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { - if (!list_empty(&port->vlans)) + if (port->count) return -EINVAL; port->passthru = true; memcpy(dev->dev_addr, lowerdev->dev_addr, ETH_ALEN); } + port->count += 1; err = register_netdevice(dev); if (err < 0) goto destroy_port; @@ -707,7 +716,8 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, return 0; destroy_port: - if (list_empty(&port->vlans)) + port->count -= 1; + if (!port->count) macvlan_port_destroy(lowerdev); return err; @@ -725,13 +735,9 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); - struct macvlan_port *port = vlan->port; list_del(&vlan->list); unregister_netdevice_queue(dev, head); - - if (list_empty(&port->vlans)) - macvlan_port_destroy(port->dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); -- cgit v1.1 From 9d2a8fa96a44ba242de3a6f56acaef7a40a97b97 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Mar 2011 18:23:34 -0700 Subject: net ipv6: Fix duplicate /proc/sys/net/ipv6/neigh directory entries. When I was fixing issues with unregisgtering tables under /proc/sys/net/ipv6/neigh by adding a mount point it appears I missed a critical ordering issue, in the ipv6 initialization. I had not realized that ipv6_sysctl_register is called at the very end of the ipv6 initialization and in particular after we call neigh_sysctl_register from ndisc_init. "neigh" needs to be initialized in ipv6_static_sysctl_register which is the first ipv6 table to initialized, and definitely before ndisc_init. This removes the weirdness of duplicate tables while still providing a "neigh" mount point which prevents races in sysctl unregistering. This was initially reported at https://bugzilla.kernel.org/show_bug.cgi?id=31232 Reported-by: sunkan@zappa.cx Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv6/sysctl_net_ipv6.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 7cb65ef..6dcf5e7 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,16 @@ static struct ctl_table empty[1]; +static ctl_table ipv6_static_skeleton[] = { + { + .procname = "neigh", + .maxlen = 0, + .mode = 0555, + .child = empty, + }, + { } +}; + static ctl_table ipv6_table_template[] = { { .procname = "route", @@ -37,12 +47,6 @@ static ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "neigh", - .maxlen = 0, - .mode = 0555, - .child = empty, - }, { } }; @@ -160,7 +164,7 @@ static struct ctl_table_header *ip6_base; int ipv6_static_sysctl_register(void) { - ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty); + ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton); if (ip6_base == NULL) return -ENOMEM; return 0; -- cgit v1.1 From 675071a2ef3f4a6d25ee002a7437d50431168344 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Mar 2011 18:24:53 -0700 Subject: veth: Fix the byte counters Commit 44540960 "veth: move loopback logic to common location" introduced a bug in the packet counters. I don't understand why that happened as it is not explained in the comments and the mut check in dev_forward_skb retains the assumption that skb->len is the total length of the packet. I just measured this emperically by setting up a veth pair between two noop network namespaces setting and attempting a telnet connection between the two. I saw three packets in each direction and the byte counters were exactly 14*3 = 42 bytes high in each direction. I got the actual packet lengths with tcpdump. So remove the extra ETH_HLEN from the veth byte count totals. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- drivers/net/veth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 105d7f0..2de9b90 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -171,7 +171,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = rcv_priv->ip_summed; - length = skb->len + ETH_HLEN; + length = skb->len; if (dev_forward_skb(rcv, skb) != NET_RX_SUCCESS) goto rx_drop; -- cgit v1.1 From f40f94fc6c3b5a5542d5ed976e9ff69a3b463802 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Mar 2011 10:15:40 +0000 Subject: ipvs: fix a typo in __ip_vs_control_init() Reported-by: Ingo Molnar Signed-off-by: Eric Dumazet Cc: Simon Horman Cc: Julian Anastasov Acked-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b799cea..33733c8 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3605,7 +3605,7 @@ int __net_init __ip_vs_control_init(struct net *net) /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (ipvs->tot_stats.cpustats) { + if (!ipvs->tot_stats.cpustats) { pr_err("%s(): alloc_percpu.\n", __func__); return -ENOMEM; } -- cgit v1.1 From 736561a01f11114146b1b7f82d486fa9c95828ef Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 21 Mar 2011 15:18:01 +0000 Subject: IPVS: Use global mutex in ip_vs_app.c As part of the work to make IPVS network namespace aware __ip_vs_app_mutex was replaced by a per-namespace lock, ipvs->app_mutex. ipvs->app_key is also supplied for debugging purposes. Unfortunately this implementation results in ipvs->app_key residing in non-static storage which at the very least causes a lockdep warning. This patch takes the rather heavy-handed approach of reinstating __ip_vs_app_mutex which will cover access to the ipvs->list_head of all network namespaces. [ 12.610000] IPVS: Creating netns size=2456 id=0 [ 12.630000] IPVS: Registered protocols (TCP, UDP, SCTP, AH, ESP) [ 12.640000] BUG: key ffff880003bbf1a0 not in .data! [ 12.640000] ------------[ cut here ]------------ [ 12.640000] WARNING: at kernel/lockdep.c:2701 lockdep_init_map+0x37b/0x570() [ 12.640000] Hardware name: Bochs [ 12.640000] Pid: 1, comm: swapper Tainted: G W 2.6.38-kexec-06330-g69b7efe-dirty #122 [ 12.650000] Call Trace: [ 12.650000] [] warn_slowpath_common+0x75/0xb0 [ 12.650000] [] warn_slowpath_null+0x15/0x20 [ 12.650000] [] lockdep_init_map+0x37b/0x570 [ 12.650000] [] ? trace_hardirqs_on+0xd/0x10 [ 12.650000] [] debug_mutex_init+0x38/0x50 [ 12.650000] [] __mutex_init+0x5c/0x70 [ 12.650000] [] __ip_vs_app_init+0x64/0x86 [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.660000] [] T.620+0x43/0x170 [ 12.660000] [] ? register_pernet_subsys+0x1a/0x40 [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.660000] [] register_pernet_operations+0x57/0xb0 [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.670000] [] register_pernet_subsys+0x29/0x40 [ 12.670000] [] ip_vs_app_init+0x10/0x12 [ 12.670000] [] ip_vs_init+0x4c/0xff [ 12.670000] [] do_one_initcall+0x7a/0x12e [ 12.670000] [] kernel_init+0x13e/0x1c2 [ 12.670000] [] kernel_thread_helper+0x4/0x10 [ 12.670000] [] ? restore_args+0x0/0x30 [ 12.680000] [] ? kernel_init+0x0/0x1c2 [ 12.680000] [] ? kernel_thread_helper+0x0/0x1global0 Signed-off-by: Simon Horman Cc: Ingo Molnar Cc: Eric Dumazet Cc: Julian Anastasov Cc: Hans Schillstrom Signed-off-by: David S. Miller --- include/net/ip_vs.h | 2 -- net/netfilter/ipvs/ip_vs_app.c | 23 ++++++++++------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 272f593..30b49ed 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -801,8 +801,6 @@ struct netns_ipvs { struct list_head rs_table[IP_VS_RTAB_SIZE]; /* ip_vs_app */ struct list_head app_list; - struct mutex app_mutex; - struct lock_class_key app_key; /* mutex debuging */ /* ip_vs_proto */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 5c48ffb..2dc6de1 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -43,6 +43,8 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); +static DEFINE_MUTEX(__ip_vs_app_mutex); + /* * Get an ip_vs_app object */ @@ -167,14 +169,13 @@ int register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, __u16 port) { - struct netns_ipvs *ipvs = net_ipvs(net); int result; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); result = ip_vs_app_inc_new(net, app, proto, port); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return result; } @@ -189,11 +190,11 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) /* increase the module use count */ ip_vs_use_count_inc(); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_add(&app->a_list, &ipvs->app_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return 0; } @@ -205,10 +206,9 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *inc, *nxt; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { ip_vs_app_inc_release(net, inc); @@ -216,7 +216,7 @@ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) list_del(&app->a_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -501,7 +501,7 @@ static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } @@ -535,9 +535,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { - struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq)); - - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) @@ -583,7 +581,6 @@ static int __net_init __ip_vs_app_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } -- cgit v1.1