diff options
Diffstat (limited to 'net')
58 files changed, 1076 insertions, 437 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index 347ec0c..2ccbf04 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -223,7 +223,7 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) req = &c->reqs[row][col]; if (!req->tc) { - req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); if (!req->wq) { printk(KERN_ERR "Couldn't grow tag array\n"); return ERR_PTR(-ENOMEM); @@ -233,17 +233,17 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) P9_TRANS_PREF_PAYLOAD_SEP) { int alloc_msize = min(c->msize, 4096); req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = alloc_msize; req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = alloc_msize; } else { req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = c->msize; req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = c->msize; } if ((!req->tc) || (!req->rc)) { diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 2ce515b..8a4084f 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -205,7 +205,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (errcode) break; - *sptr = kmalloc(len + 1, GFP_KERNEL); + *sptr = kmalloc(len + 1, GFP_NOFS); if (*sptr == NULL) { errcode = -EFAULT; break; @@ -273,7 +273,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (!errcode) { *wnames = kmalloc(sizeof(char *) * *nwname, - GFP_KERNEL); + GFP_NOFS); if (!*wnames) errcode = -ENOMEM; } @@ -317,7 +317,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, *wqids = kmalloc(*nwqid * sizeof(struct p9_qid), - GFP_KERNEL); + GFP_NOFS); if (*wqids == NULL) errcode = -ENOMEM; } diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index d62b9aa..9172ab7 100644 --- 
a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -41,9 +41,9 @@ EXPORT_SYMBOL(p9_release_req_pages); int p9_nr_pages(struct p9_req_t *req) { - int start_page, end_page; - start_page = (unsigned long long)req->tc->pubuf >> PAGE_SHIFT; - end_page = ((unsigned long long)req->tc->pubuf + req->tc->pbuf_size + + unsigned long start_page, end_page; + start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; + end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; return end_page - start_page; } @@ -69,8 +69,8 @@ p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, *pdata_off = (size_t)req->tc->pubuf & (PAGE_SIZE-1); if (*pdata_off) - first_page_bytes = min((PAGE_SIZE - *pdata_off), - req->tc->pbuf_size); + first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), + req->tc->pbuf_size); rpinfo = req->tc->private; pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index a30471e..aa5672b 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -350,7 +350,7 @@ static void p9_read_work(struct work_struct *work) if (m->req->rc == NULL) { m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_KERNEL); + m->client->msize, GFP_NOFS); if (!m->req->rc) { m->req = NULL; err = -ENOMEM; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 29a54cc..150e0c4 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -424,7 +424,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) struct p9_rdma_context *rpl_context = NULL; /* Allocate an fcall for the reply */ - rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); + rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto err_close; @@ -437,7 +437,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) */ if (!req->rc) { req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, - GFP_KERNEL); + 
GFP_NOFS); if (req->rc) { req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); @@ -468,7 +468,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) req->rc = NULL; /* Post the request */ - c = kmalloc(sizeof *c, GFP_KERNEL); + c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto err_free1; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 9b550ed..e8f046b 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -43,6 +43,7 @@ #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/scatterlist.h> +#include <linux/swap.h> #include <linux/virtio.h> #include <linux/virtio_9p.h> #include "trans_common.h" @@ -51,6 +52,8 @@ /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); +static DECLARE_WAIT_QUEUE_HEAD(vp_wq); +static atomic_t vp_pinned = ATOMIC_INIT(0); /** * struct virtio_chan - per-instance transport information @@ -78,7 +81,10 @@ struct virtio_chan { struct virtqueue *vq; int ring_bufs_avail; wait_queue_head_t *vc_wq; - + /* This is global limit. Since we don't have a global structure, + * will be placing it in each channel. + */ + int p9_max_pages; /* Scatterlist: can be too big for stack. 
*/ struct scatterlist sg[VIRTQUEUE_NUM]; @@ -141,34 +147,36 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); - do { + while (1) { spin_lock_irqsave(&chan->lock, flags); rc = virtqueue_get_buf(chan->vq, &len); - if (rc != NULL) { - if (!chan->ring_bufs_avail) { - chan->ring_bufs_avail = 1; - wake_up(chan->vc_wq); - } - spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); - P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", - rc->tag); - req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - if (req->tc->private) { - struct trans_rpage_info *rp = req->tc->private; - /*Release pages */ - p9_release_req_pages(rp); - if (rp->rp_alloc) - kfree(rp); - req->tc->private = NULL; - } - p9_client_cb(chan->client, req); - } else { + if (rc == NULL) { spin_unlock_irqrestore(&chan->lock, flags); + break; + } + + chan->ring_bufs_avail = 1; + spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. 
*/ + wake_up(chan->vc_wq); + P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); + P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); + req = p9_tag_lookup(chan->client, rc->tag); + if (req->tc->private) { + struct trans_rpage_info *rp = req->tc->private; + int p = rp->rp_nr_pages; + /*Release pages */ + p9_release_req_pages(rp); + atomic_sub(p, &vp_pinned); + wake_up(&vp_wq); + if (rp->rp_alloc) + kfree(rp); + req->tc->private = NULL; } - } while (rc != NULL); + req->status = REQ_STATUS_RCVD; + p9_client_cb(chan->client, req); + } } /** @@ -263,7 +271,6 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); -req_retry: req->status = REQ_STATUS_SENT; if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { @@ -271,6 +278,14 @@ req_retry: int rpinfo_size = sizeof(struct trans_rpage_info) + sizeof(struct page *) * nr_pages; + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + atomic_read(&vp_pinned) < chan->p9_max_pages); + if (err == -ERESTARTSYS) + return err; + P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); + } + if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { /* We can use sdata */ req->tc->private = req->tc->sdata + req->tc->size; @@ -293,9 +308,12 @@ req_retry: if (rpinfo->rp_alloc) kfree(rpinfo); return err; + } else { + atomic_add(rpinfo->rp_nr_pages, &vp_pinned); } } +req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); /* Handle out VirtIO ring buffers */ @@ -356,7 +374,7 @@ req_retry: return err; P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); - goto req_retry; + goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, @@ -453,6 +471,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; + /* Ceiling limit to avoid denial of service attacks */ + chan->p9_max_pages = 
nr_free_buffer_pages()/4; mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); diff --git a/net/9p/util.c b/net/9p/util.c index e048701..b84619b 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -92,7 +92,7 @@ int p9_idpool_get(struct p9_idpool *p) unsigned long flags; retry: - if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) + if (idr_pre_get(&p->pool, GFP_NOFS) == 0) return 0; spin_lock_irqsave(&p->lock, flags); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3d4f4b0..956a530 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1051,13 +1051,17 @@ static int atalk_release(struct socket *sock) { struct sock *sk = sock->sk; - lock_sock(sk); if (sk) { + sock_hold(sk); + lock_sock(sk); + sock_orphan(sk); sock->sk = NULL; atalk_destroy_socket(sk); + + release_sock(sk); + sock_put(sk); } - release_sock(sk); return 0; } diff --git a/net/atm/common.c b/net/atm/common.c index 1b9c52a..22b963d 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -252,6 +252,7 @@ void atm_dev_release_vccs(struct atm_dev *dev) } write_unlock_irq(&vcc_sklist_lock); } +EXPORT_SYMBOL(atm_dev_release_vccs); static int adjust_tp(struct atm_trafprm *tp, unsigned char aal) { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index dce8f00..718b603 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -389,6 +389,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + bool changed_addr; /* Don't allow bridging non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || @@ -446,7 +447,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) list_add_rcu(&p->list, &br->port_list); spin_lock_bh(&br->lock); - br_stp_recalculate_bridge_id(br); + changed_addr = br_stp_recalculate_bridge_id(br); br_features_recompute(br); if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && @@ -456,6 +457,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) br_ifinfo_notify(RTM_NEWLINK, 
p); + if (changed_addr) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + dev_set_mtu(br->dev, br_min_mtu(br)); kobject_uevent(&p->kobj, KOBJ_ADD); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 030a002..59660c9 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -445,9 +445,9 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->payload_len = htons(8 + sizeof(*mldq)); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; + ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, &ip6h->saddr); - ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); @@ -1475,7 +1475,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, ip6h->payload_len == 0) return 0; - len = ntohs(ip6h->payload_len); + len = ntohs(ip6h->payload_len) + sizeof(*ip6h); if (skb->len < len) return -EINVAL; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 19e2f46..387013d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -497,7 +497,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br); extern void br_stp_set_enabled(struct net_bridge *br, unsigned long val); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); -extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern bool br_stp_recalculate_bridge_id(struct net_bridge *br); extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 79372d4..9b61d09 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -204,7 +204,7 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) static const 
unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; /* called under bridge lock */ -void br_stp_recalculate_bridge_id(struct net_bridge *br) +bool br_stp_recalculate_bridge_id(struct net_bridge *br) { const unsigned char *br_mac_zero = (const unsigned char *)br_mac_zero_aligned; @@ -213,7 +213,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) /* user has chosen a value so keep it */ if (br->flags & BR_SET_MAC_ADDR) - return; + return false; list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || @@ -222,8 +222,11 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) } - if (compare_ether_addr(br->bridge_id.addr, addr)) - br_stp_change_bridge_id(br, addr); + if (compare_ether_addr(br->bridge_id.addr, addr) == 0) + return false; /* no change */ + + br_stp_change_bridge_id(br, addr); + return true; } /* called under bridge lock */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 702be5a..733d66f 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -95,7 +95,7 @@ struct s_pstats can_pstats; /* receive list statistics */ * af_can socket functions */ -static int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -108,6 +108,7 @@ static int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -ENOIOCTLCMD; } } +EXPORT_SYMBOL(can_ioctl); static void can_sock_destruct(struct sock *sk) { @@ -698,13 +699,9 @@ int can_proto_register(struct can_proto *cp) printk(KERN_ERR "can: protocol %d already registered\n", proto); err = -EBUSY; - } else { + } else proto_tab[proto] = cp; - /* use generic ioctl function if not defined by module */ - if (!cp->ops->ioctl) - cp->ops->ioctl = can_ioctl; - } spin_unlock(&proto_tab_lock); if (err < 0) diff --git a/net/can/bcm.c b/net/can/bcm.c index 092dc88..871a0ad 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1569,7 +1569,7 @@ static int 
bcm_recvmsg(struct kiocb *iocb, struct socket *sock, return size; } -static struct proto_ops bcm_ops __read_mostly = { +static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, @@ -1578,7 +1578,7 @@ static struct proto_ops bcm_ops __read_mostly = { .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, - .ioctl = NULL, /* use can_ioctl() from af_can.c */ + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/can/raw.c b/net/can/raw.c index 883e9d7..649acfa 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -742,7 +742,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, return size; } -static struct proto_ops raw_ops __read_mostly = { +static const struct proto_ops raw_ops = { .family = PF_CAN, .release = raw_release, .bind = raw_bind, @@ -751,7 +751,7 @@ static struct proto_ops raw_ops __read_mostly = { .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, - .ioctl = NULL, /* use can_ioctl() from af_can.c */ + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, diff --git a/net/ceph/armor.c b/net/ceph/armor.c index eb2a666..1fc1ee1 100644 --- a/net/ceph/armor.c +++ b/net/ceph/armor.c @@ -78,8 +78,10 @@ int ceph_unarmor(char *dst, const char *src, const char *end) while (src < end) { int a, b, c, d; - if (src < end && src[0] == '\n') + if (src[0] == '\n') { src++; + continue; + } if (src + 4 > end) return -EINVAL; a = decode_bits(src[0]); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f3e4a13..95f96ab 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -62,6 +62,7 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_OSD_MAP: return "osd_map"; case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OPREPLY: return 
"osd_opreply"; + case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; default: return "unknown"; } } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3e20a12..02212ed 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -22,10 +22,15 @@ #define OSD_OPREPLY_FRONT_LEN 512 static const struct ceph_connection_operations osd_con_ops; -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd); -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void send_queued(struct ceph_osd_client *osdc); +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static int __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); static int op_needs_trail(int op) { @@ -34,6 +39,7 @@ static int op_needs_trail(int op) case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY: return 1; default: return 0; @@ -209,6 +215,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, init_completion(&req->r_completion); init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd); req->r_flags = flags; WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); @@ -315,6 +323,24 @@ static void osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_STARTSYNC: break; + case CEPH_OSD_OP_NOTIFY: + { + __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); + __le32 timeout = cpu_to_le32(src->watch.timeout); + + BUG_ON(!req->r_trail); + + ceph_pagelist_append(req->r_trail, + &prot_ver, sizeof(prot_ver)); + ceph_pagelist_append(req->r_trail, + &timeout, sizeof(timeout)); + } + case 
CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; default: pr_err("unrecognized osd opcode %d\n", dst->op); WARN_ON(1); @@ -529,6 +555,45 @@ __lookup_request_ge(struct ceph_osd_client *osdc, return NULL; } +/* + * Resubmit requests pending on the given osd. + */ +static void __kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + struct ceph_osd_request *req, *nreq; + int err; + + dout("__kick_osd_requests osd%d\n", osd->o_osd); + err = __reset_osd(osdc, osd); + if (err == -EAGAIN) + return; + + list_for_each_entry(req, &osd->o_requests, r_osd_item) { + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, + r_linger_osd) { + __unregister_linger_request(osdc, req); + __register_request(osdc, req); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + } +} + +static void kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, kickosd); + mutex_unlock(&osdc->request_mutex); +} /* * If the osd connection drops, we need to resubmit all requests. 
@@ -543,7 +608,8 @@ static void osd_reset(struct ceph_connection *con) dout("osd_reset osd%d\n", osd->o_osd); osdc = osd->o_osdc; down_read(&osdc->map_sem); - kick_requests(osdc, osd); + kick_osd_requests(osdc, osd); + send_queued(osdc); up_read(&osdc->map_sem); } @@ -561,6 +627,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); osd->o_incarnation = 1; @@ -650,7 +717,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) int ret = 0; dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests)) { + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], &osd->o_con.peer_addr, @@ -723,10 +791,9 @@ static void __cancel_osd_timeout(struct ceph_osd_client *osdc) * Register request, assign tid. If this is the first request, set up * the timeout event. 
*/ -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { - mutex_lock(&osdc->request_mutex); req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); INIT_LIST_HEAD(&req->r_req_lru_item); @@ -740,6 +807,13 @@ static void register_request(struct ceph_osd_client *osdc, dout(" first request, scheduling timeout\n"); __schedule_osd_timeout(osdc); } +} + +static void register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + __register_request(osdc, req); mutex_unlock(&osdc->request_mutex); } @@ -758,9 +832,14 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_con_revoke(&req->r_osd->o_con, req->r_request); list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests)) + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); __move_osd_to_lru(osdc, req->r_osd); - req->r_osd = NULL; + } + if (list_empty(&req->r_osd_item) && + list_empty(&req->r_linger_item)) + req->r_osd = NULL; } ceph_osdc_put_request(req); @@ -781,20 +860,72 @@ static void __cancel_request(struct ceph_osd_request *req) ceph_con_revoke(&req->r_osd->o_con, req->r_request); req->r_sent = 0; } - list_del_init(&req->r_req_lru_item); } +static void __register_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__register_linger_request %p\n", req); + list_add_tail(&req->r_linger_item, &osdc->req_linger); + list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); +} + +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__unregister_linger_request %p\n", req); + if (req->r_osd) { + list_del_init(&req->r_linger_item); + list_del_init(&req->r_linger_osd); + + if 
(list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); + __move_osd_to_lru(osdc, req->r_osd); + } + req->r_osd = NULL; + } +} + +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + if (req->r_linger) { + __unregister_linger_request(osdc, req); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); +} +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); + +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + if (!req->r_linger) { + dout("set_request_linger %p\n", req); + req->r_linger = 1; + /* + * caller is now responsible for calling + * unregister_linger_request + */ + ceph_osdc_get_request(req); + } +} +EXPORT_SYMBOL(ceph_osdc_set_request_linger); + /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. + * no up osd, set r_osd to NULL. Move the request to the appropiate list + * (unsent, homeless) or leave on in-flight lru. * * Return 0 if unchanged, 1 if changed, or negative on error. * * Caller should hold map_sem for read and request_mutex. 
*/ -static int __map_osds(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static int __map_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; @@ -802,11 +933,13 @@ static int __map_osds(struct ceph_osd_client *osdc, int o = -1, num = 0; int err; - dout("map_osds %p tid %lld\n", req, req->r_tid); + dout("map_request %p tid %lld\n", req, req->r_tid); err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, &req->r_file_layout, osdc->osdmap); - if (err) + if (err) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; + } pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; @@ -823,7 +956,7 @@ static int __map_osds(struct ceph_osd_client *osdc, (req->r_osd == NULL && o == -1)) return 0; /* no change */ - dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", + dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, req->r_osd ? 
req->r_osd->o_osd : -1); @@ -841,10 +974,12 @@ static int __map_osds(struct ceph_osd_client *osdc, if (!req->r_osd && o >= 0) { err = -ENOMEM; req->r_osd = create_osd(osdc); - if (!req->r_osd) + if (!req->r_osd) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; + } - dout("map_osds osd %p is osd%d\n", req->r_osd, o); + dout("map_request osd %p is osd%d\n", req->r_osd, o); req->r_osd->o_osd = o; req->r_osd->o_con.peer_name.num = cpu_to_le64(o); __insert_osd(osdc, req->r_osd); @@ -855,6 +990,9 @@ static int __map_osds(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + } else { + list_move(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ @@ -869,16 +1007,6 @@ static int __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead; - int err; - - err = __map_osds(osdc, req); - if (err < 0) - return err; - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - return 0; - } dout("send_request %p tid %llu to osd%d flags %d\n", req, req->r_tid, req->r_osd->o_osd, req->r_flags); @@ -898,6 +1026,21 @@ static int __send_request(struct ceph_osd_client *osdc, } /* + * Send any requests in the queue (req_unsent). + */ +static void send_queued(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *tmp; + + dout("send_queued\n"); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + __send_request(osdc, req); + } + mutex_unlock(&osdc->request_mutex); +} + +/* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. 
When this * happens, we ping all OSDs with requests who have timed out to @@ -916,30 +1059,13 @@ static void handle_timeout(struct work_struct *work) unsigned long keepalive = osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; - struct rb_node *p; struct list_head slow_osds; - dout("timeout\n"); down_read(&osdc->map_sem); ceph_monc_request_next_osdmap(&osdc->client->monc); mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - int err; - - dout("osdc resending prev failed %lld\n", req->r_tid); - err = __send_request(osdc, req); - if (err) - dout("osdc failed again on %lld\n", req->r_tid); - else - req->r_resend = false; - continue; - } - } /* * reset osds that appear to be _really_ unresponsive. this @@ -963,7 +1089,7 @@ static void handle_timeout(struct work_struct *work) BUG_ON(!osd); pr_warning(" tid %llu timed out on osd%d, will reset osd\n", req->r_tid, osd->o_osd); - __kick_requests(osdc, osd); + __kick_osd_requests(osdc, osd); } /* @@ -991,7 +1117,7 @@ static void handle_timeout(struct work_struct *work) __schedule_osd_timeout(osdc); mutex_unlock(&osdc->request_mutex); - + send_queued(osdc); up_read(&osdc->map_sem); } @@ -1035,7 +1161,6 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, numops * sizeof(struct ceph_osd_op)) goto bad; dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); - /* lookup */ mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); @@ -1079,6 +1204,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, dout("handle_reply tid %llu flags %d\n", tid, flags); + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) + __register_linger_request(osdc, req); + /* either this is a read, or we got the safe response */ if (result < 0 || (flags & CEPH_OSD_FLAG_ONDISK) || @@ -1099,6 +1227,7 @@ static void 
handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, } done: + dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; @@ -1109,108 +1238,83 @@ bad: ceph_msg_dump(msg); } - -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) +static void reset_changed_osds(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; struct rb_node *p, *n; - int needmap = 0; - int err; - dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); - if (kickosd) { - err = __reset_osd(osdc, kickosd); - if (err == -EAGAIN) - return 1; - } else { - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = - rb_entry(p, struct ceph_osd, o_node); - - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } + for (p = rb_first(&osdc->osds); p; p = n) { + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + + n = rb_next(p); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) + __reset_osd(osdc, osd); } +} + +/* + * Requeue requests whose mapping to an OSD has changed. If requests map to + * no osd, request a new map. + * + * Caller should hold map_sem for read and request_mutex. 
+ */ +static void kick_requests(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *nreq; + struct rb_node *p; + int needmap = 0; + int err; + dout("kick_requests\n"); + mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - dout(" r_resend set on tid %llu\n", req->r_tid); - __cancel_request(req); - goto kick; - } - if (req->r_osd && kickosd == req->r_osd) { - __cancel_request(req); - goto kick; + err = __map_request(osdc, req); + if (err < 0) + continue; /* error */ + if (req->r_osd == NULL) { + dout("%p tid %llu maps to no osd\n", req, req->r_tid); + needmap++; /* request a newer map */ + } else if (err > 0) { + dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; } + } + + list_for_each_entry_safe(req, nreq, &osdc->req_linger, + r_linger_item) { + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - err = __map_osds(osdc, req); + err = __map_request(osdc, req); if (err == 0) - continue; /* no change */ - if (err < 0) { - /* - * FIXME: really, we should set the request - * error and fail if this isn't a 'nofail' - * request, but that's a fair bit more - * complicated to do. So retry! - */ - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - continue; - } + continue; /* no change and no osd was specified */ + if (err < 0) + continue; /* hrm! */ if (req->r_osd == NULL) { dout("tid %llu maps to no valid osd\n", req->r_tid); needmap++; /* request a newer map */ continue; } -kick: - dout("kicking %p tid %llu osd%d\n", req, req->r_tid, + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, req->r_osd ? 
req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - err = __send_request(osdc, req); - if (err) { - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - } + __unregister_linger_request(osdc, req); + __register_request(osdc, req); } - - return needmap; -} - -/* - * Resubmit osd requests whose osd or osd address has changed. Request - * a new osd map if osds are down, or we are otherwise unable to determine - * how to direct a request. - * - * Close connections to down osds. - * - * If @who is specified, resubmit requests for that specific osd. - * - * Caller should hold map_sem for read and request_mutex. - */ -static void kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) -{ - int needmap; - - mutex_lock(&osdc->request_mutex); - needmap = __kick_requests(osdc, kickosd); mutex_unlock(&osdc->request_mutex); if (needmap) { dout("%d requests for down osds, need new map\n", needmap); ceph_monc_request_next_osdmap(&osdc->client->monc); } - } + + /* * Process updated osd map. 
* @@ -1263,6 +1367,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } + kick_requests(osdc); + reset_changed_osds(osdc); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1300,6 +1406,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap = newmap; if (oldmap) ceph_osdmap_destroy(oldmap); + kick_requests(osdc); } p += maplen; nr_maps--; @@ -1308,8 +1415,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); - if (newmap) - kick_requests(osdc, NULL); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); return; @@ -1322,6 +1428,223 @@ bad: } /* + * watch/notify callback event infrastructure + * + * These callbacks are used both for watch and notify operations. + */ +static void __release_event(struct kref *kref) +{ + struct ceph_osd_event *event = + container_of(kref, struct ceph_osd_event, kref); + + dout("__release_event %p\n", event); + kfree(event); +} + +static void get_event(struct ceph_osd_event *event) +{ + kref_get(&event->kref); +} + +void ceph_osdc_put_event(struct ceph_osd_event *event) +{ + kref_put(&event->kref, __release_event); +} +EXPORT_SYMBOL(ceph_osdc_put_event); + +static void __insert_event(struct ceph_osd_client *osdc, + struct ceph_osd_event *new) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (new->cookie < event->cookie) + p = &(*p)->rb_left; + else if (new->cookie > event->cookie) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &osdc->event_tree); +} + +static struct ceph_osd_event 
*__find_event(struct ceph_osd_client *osdc, + u64 cookie) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (cookie < event->cookie) + p = &(*p)->rb_left; + else if (cookie > event->cookie) + p = &(*p)->rb_right; + else + return event; + } + return NULL; +} + +/* + * Unlink @event from osdc->event_tree and drop the tree's reference. + * A node that is already unlinked is left alone, so calling this twice + * on the same event is harmless. + */ +static void __remove_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + if (!RB_EMPTY_NODE(&event->node)) { + dout("__remove_event removed %p\n", event); + rb_erase(&event->node, &osdc->event_tree); + /* rb_erase() leaves the node looking linked; mark it empty before the put can free it */ + RB_CLEAR_NODE(&event->node); + ceph_osdc_put_event(event); + } else { + dout("__remove_event didn't remove %p\n", event); + } +} + +int ceph_osdc_create_event(struct ceph_osd_client *osdc, + void (*event_cb)(u64, u64, u8, void *), + int one_shot, void *data, + struct ceph_osd_event **pevent) +{ + struct ceph_osd_event *event; + + event = kmalloc(sizeof(*event), GFP_NOIO); + if (!event) + return -ENOMEM; + + dout("create_event %p\n", event); + event->cb = event_cb; + event->one_shot = one_shot; + event->data = data; + event->osdc = osdc; + INIT_LIST_HEAD(&event->osd_node); + kref_init(&event->kref); /* one ref for us */ + kref_get(&event->kref); /* one ref for the caller */ + init_completion(&event->completion); + + spin_lock(&osdc->event_lock); + event->cookie = ++osdc->event_count; + __insert_event(osdc, event); + spin_unlock(&osdc->event_lock); + + *pevent = event; + return 0; +} +EXPORT_SYMBOL(ceph_osdc_create_event); + +void ceph_osdc_cancel_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + dout("cancel_event %p\n", event); + spin_lock(&osdc->event_lock); + __remove_event(event); + spin_unlock(&osdc->event_lock); + ceph_osdc_put_event(event); /* caller's */ +} +EXPORT_SYMBOL(ceph_osdc_cancel_event); + + +static void do_event_work(struct work_struct *work) +{ + struct ceph_osd_event_work 
*event_work = + container_of(work, struct ceph_osd_event_work, work); + struct ceph_osd_event *event = event_work->event; + u64 ver = event_work->ver; + u64 notify_id = event_work->notify_id; + u8 opcode = event_work->opcode; + + dout("do_event_work completing %p\n", event); + event->cb(ver, notify_id, opcode, event->data); + complete(&event->completion); + dout("do_event_work completed %p\n", event); + ceph_osdc_put_event(event); + kfree(event_work); +} + + +/* + * Process osd watch notifications + * + * Decodes proto_ver, opcode, cookie, ver and notify_id from the message + * front, looks up the event registered under the cookie (taking a + * reference, and unlinking it if it is one-shot) and queues a work item + * on osdc->notify_wq that runs the event callback. If the work item + * cannot be allocated or queued, the event is completed and the + * reference taken here is dropped. + */ +void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + void *p, *end; + u8 proto_ver; + u64 cookie, ver, notify_id; + u8 opcode; + struct ceph_osd_event *event; + struct ceph_osd_event_work *event_work; + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_8_safe(&p, end, proto_ver, bad); + ceph_decode_8_safe(&p, end, opcode, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + ceph_decode_64_safe(&p, end, ver, bad); + ceph_decode_64_safe(&p, end, notify_id, bad); + + spin_lock(&osdc->event_lock); + event = __find_event(osdc, cookie); + if (event) { + get_event(event); + if (event->one_shot) + __remove_event(event); + } + spin_unlock(&osdc->event_lock); + dout("handle_watch_notify cookie %lld ver %lld event %p\n", + cookie, ver, event); + if (event) { + event_work = kmalloc(sizeof(*event_work), GFP_NOIO); + if (!event_work) { + dout("ERROR: could not allocate event_work\n"); + goto done_err; + } + /* touch the work item only after the allocation is known good */ + INIT_WORK(&event_work->work, do_event_work); + event_work->event = event; + event_work->ver = ver; + event_work->notify_id = notify_id; + event_work->opcode = opcode; + if (!queue_work(osdc->notify_wq, &event_work->work)) { + dout("WARNING: failed to queue notify event work\n"); + kfree(event_work); /* never queued, so still ours to free */ + goto done_err; + } + } + + return; + +done_err: + complete(&event->completion); + ceph_osdc_put_event(event); + return; + +bad: + pr_err("osdc handle_watch_notify corrupt msg\n"); + return; +} + +int ceph_osdc_wait_event(struct ceph_osd_event 
*event, unsigned long timeout) +{ + int err; + + dout("wait_event %p\n", event); + err = wait_for_completion_interruptible_timeout(&event->completion, + timeout * HZ); + ceph_osdc_put_event(event); + if (err > 0) + err = 0; + dout("wait_event %p returns %d\n", event, err); + return err; +} +EXPORT_SYMBOL(ceph_osdc_wait_event); + +/* * Register request, send initial attempt. */ int ceph_osdc_start_request(struct ceph_osd_client *osdc, @@ -1347,15 +1670,22 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, * the request still han't been touched yet. */ if (req->r_sent == 0) { - rc = __send_request(osdc, req); - if (rc) { - if (nofail) { - dout("osdc_start_request failed send, " - " marking %lld\n", req->r_tid); - req->r_resend = true; - rc = 0; - } else { - __unregister_request(osdc, req); + rc = __map_request(osdc, req); + if (rc < 0) + return rc; + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + rc = __send_request(osdc, req); + if (rc) { + if (nofail) { + dout("osdc_start_request failed send, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } } } } @@ -1441,9 +1771,15 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_LIST_HEAD(&osdc->osd_lru); osdc->requests = RB_ROOT; INIT_LIST_HEAD(&osdc->req_lru); + INIT_LIST_HEAD(&osdc->req_unsent); + INIT_LIST_HEAD(&osdc->req_notarget); + INIT_LIST_HEAD(&osdc->req_linger); osdc->num_requests = 0; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); + spin_lock_init(&osdc->event_lock); + osdc->event_tree = RB_ROOT; + osdc->event_count = 0; schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); @@ -1463,6 +1799,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) "osd_op_reply"); if 
(err < 0) goto out_msgpool; + + osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); + if (!osdc->notify_wq) { + /* failure is reported as NULL, not as an ERR_PTR */ + err = -ENOMEM; + goto out_msgpool; + } return 0; out_msgpool: @@ -1476,6 +1819,8 @@ EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { + flush_workqueue(osdc->notify_wq); + destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); if (osdc->osdmap) { @@ -1483,6 +1828,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) osdc->osdmap = NULL; } remove_old_osds(osdc, 1); + WARN_ON(!RB_EMPTY_ROOT(&osdc->osds)); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); @@ -1591,6 +1937,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_OSD_OPREPLY: handle_reply(osdc, msg, con); break; + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(osdc, msg); + break; default: pr_err("received unknown message type %d %s\n", type, @@ -1684,6 +2033,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: + case CEPH_MSG_WATCH_NOTIFY: return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); diff --git a/net/core/dev.c b/net/core/dev.c index 0b88eba..3da9fb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1140,9 +1140,6 @@ static int __dev_open(struct net_device *dev) ASSERT_RTNL(); - /* - * Is it even present? 
- */ if (!netif_device_present(dev)) return -ENODEV; @@ -1151,9 +1148,6 @@ static int __dev_open(struct net_device *dev) if (ret) return ret; - /* - * Call device private open method - */ set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) @@ -1162,31 +1156,12 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - /* - * If it went open OK then: - */ - if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - /* - * Set the flags. - */ dev->flags |= IFF_UP; - - /* - * Enable NET_DMA - */ net_dmaengine_get(); - - /* - * Initialize multicasting status - */ dev_set_rx_mode(dev); - - /* - * Wakeup transmit queue engine - */ dev_activate(dev); } @@ -1209,22 +1184,13 @@ int dev_open(struct net_device *dev) { int ret; - /* - * Is it already up? - */ if (dev->flags & IFF_UP) return 0; - /* - * Open device - */ ret = __dev_open(dev); if (ret < 0) return ret; - /* - * ... and announce new interface. - */ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_UP, dev); @@ -1240,10 +1206,6 @@ static int __dev_close_many(struct list_head *head) might_sleep(); list_for_each_entry(dev, head, unreg_list) { - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1272,15 +1234,7 @@ static int __dev_close_many(struct list_head *head) if (ops->ndo_stop) ops->ndo_stop(dev); - /* - * Device is now down. 
- */ - dev->flags &= ~IFF_UP; - - /* - * Shutdown NET_DMA - */ net_dmaengine_put(); } @@ -1309,9 +1263,6 @@ static int dev_close_many(struct list_head *head) __dev_close_many(head); - /* - * Tell people we are down - */ list_for_each_entry(dev, head, unreg_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_DOWN, dev); @@ -1353,14 +1304,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } + u32 flags; + + if (dev->ethtool_ops && dev->ethtool_ops->get_flags) + flags = dev->ethtool_ops->get_flags(dev); + else + flags = ethtool_op_get_flags(dev); + + if (!(flags & ETH_FLAG_LRO)) + return; + + __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); WARN_ON(dev->features & NETIF_F_LRO); } EXPORT_SYMBOL(dev_disable_lro); @@ -1368,11 +1322,6 @@ EXPORT_SYMBOL(dev_disable_lro); static int dev_boot_phase = 1; -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. 
- */ - /** * register_netdevice_notifier - register a network notifier block * @nb: notifier @@ -1474,6 +1423,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } +EXPORT_SYMBOL(call_netdevice_notifiers); /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); @@ -1504,6 +1454,27 @@ static inline void net_timestamp_check(struct sk_buff *skb) __net_timestamp(skb); } +static inline bool is_skb_forwardable(struct net_device *dev, + struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1527,8 +1498,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (unlikely(!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { + if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 36e603c..706502f 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -350,7 +350,7 @@ static int __init init_net_drop_monitor(void) struct per_cpu_dm_data *data; int cpu, rc; - printk(KERN_INFO "Initalizing network drop monitor service\n"); + printk(KERN_INFO "Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a1086fb..74ead9e 100644 --- a/net/core/ethtool.c +++ 
b/net/core/ethtool.c @@ -141,9 +141,24 @@ u32 ethtool_op_get_flags(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_flags); +/* Check if device can enable (or disable) particular feature coded in "data" + * argument. Flags "supported" describe features that can be toggled by device. + * If feature can not be toggled, it state (enabled or disabled) must match + * hardcoded device features state, otherwise flags are marked as invalid. + */ +bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) +{ + u32 features = dev->features & flags_dup_features; + /* "data" can contain only flags_dup_features bits, + * see __ethtool_set_flags */ + + return (features & ~supported) != (data & ~supported); +} +EXPORT_SYMBOL(ethtool_invalid_flags); + int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) { - if (data & ~supported) + if (ethtool_invalid_flags(dev, data, supported)) return -EINVAL; dev->features = ((dev->features & ~flags_dup_features) | @@ -513,7 +528,7 @@ static int ethtool_set_one_feature(struct net_device *dev, } } -static int __ethtool_set_flags(struct net_device *dev, u32 data) +int __ethtool_set_flags(struct net_device *dev, u32 data) { u32 changed; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0c55eaa..aeeece7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3761,7 +3761,10 @@ static int __init pktgen_create_thread(int cpu) list_add_tail(&t->th_list, &pktgen_threads); init_completion(&t->start_done); - p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); if (IS_ERR(p)) { pr_err("kernel_thread() failed for cpu %d\n", t->cpu); list_del(&t->th_list); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 090d273..1b74d3b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -215,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) case ARPHRD_INFINIBAND: 
ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; + case ARPHRD_IPGRE: + ip_ipgre_mc_map(addr, dev->broadcast, haddr); + return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6d85800d..5345b0b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -64,6 +64,8 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include "fib_lookup.h" + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -151,6 +153,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) break; } } + if (!result) { + struct flowi4 fl4 = { .daddr = addr }; + struct fib_result res = { 0 }; + struct fib_table *local; + + /* Fallback to FIB local table so that communication + * over loopback subnets work. + */ + local = fib_get_table(net, RT_TABLE_LOCAL); + if (local && + !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && + res.type == RTN_LOCAL) + result = FIB_RES_DEV(res); + } if (result && devref) dev_hold(result); rcu_read_unlock(); @@ -345,6 +361,17 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } } + /* On promotion all secondaries from subnet are changing + * the primary IP, we must remove all their routes silently + * and later to add them back with new prefsrc. Do this + * while all addresses are on the device list. + */ + for (ifa = promote; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) + fib_del_ifaddr(ifa, ifa1); + } + /* 2. 
Unlink it */ *ifap = ifa1->ifa_next; @@ -364,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { + struct in_ifaddr *next_sec = promote->ifa_next; if (prev_prom) { prev_prom->ifa_next = promote->ifa_next; @@ -375,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a373a25..4510883 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -228,7 +228,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -258,7 +258,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } } @@ -722,12 +722,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) } } -static void fib_del_ifaddr(struct in_ifaddr *ifa) +/* Delete primary or secondary address. + * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. 
+ */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; - struct in_ifaddr *prim = ifa; + struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 @@ -735,17 +740,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; + int subnet = 0; /* Primary network */ + int gone = 1; /* Address is missing */ + int same_prefsrc = 0; /* Another primary with same IP */ - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); - else { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } + if (iprim && iprim != prim) { + printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); + return; + } + } else if (!ipv4_is_zeronet(any) && + (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); + subnet = 1; } /* Deletion is more complicated than add. @@ -755,6 +769,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1 == ifa) { + /* promotion, keep the IP */ + gone = 0; + continue; + } + /* Ignore IFAs from our subnet */ + if (iprim && ifa1->ifa_mask == iprim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, iprim)) + continue; + + /* Ignore ifa1 if it uses different primary IP (prefsrc) */ + if (ifa1->ifa_flags & IFA_F_SECONDARY) { + /* Another address from our subnet? 
*/ + if (ifa1->ifa_mask == prim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, prim)) + prim1 = prim; + else { + /* We reached the secondaries, so + * same_prefsrc should be determined. + */ + if (!same_prefsrc) + continue; + /* Search new prim1 if ifa1 is not + * using the current prim1 + */ + if (!prim1 || + ifa1->ifa_mask != prim1->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, prim1)) + prim1 = inet_ifa_byprefix(in_dev, + ifa1->ifa_address, + ifa1->ifa_mask); + if (!prim1) + continue; + if (prim1->ifa_local != prim->ifa_local) + continue; + } + } else { + if (prim->ifa_local != ifa1->ifa_local) + continue; + prim1 = ifa1; + if (prim != prim1) + same_prefsrc = 1; + } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) @@ -763,19 +820,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; + /* primary has network specific broadcasts */ + if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { + __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; + __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + + if (!ipv4_is_zeronet(any1)) { + if (ifa->ifa_broadcast == brd1 || + ifa->ifa_broadcast == any1) + ok |= BRD_OK; + if (brd == brd1 || brd == any1) + ok |= BRD1_OK; + if (any == brd1 || any == any1) + ok |= BRD0_OK; + } + } } if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok & BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok & BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (subnet && ifa->ifa_prefixlen < 31) { + if (!(ok & BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok & BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + } if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. 
*/ - if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + if (gone && + inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -885,6 +960,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: @@ -892,12 +968,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - fib_update_nh_saddrs(dev); + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_del_ifaddr(ifa); - fib_update_nh_saddrs(dev); + fib_del_ifaddr(ifa, NULL); + atomic_inc(&net->ipv4.dev_addr_genid); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. * Disable IP. @@ -915,6 +991,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); @@ -932,6 +1009,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: @@ -990,6 +1068,7 @@ static void ip_fib_net_exit(struct net *net) fib4_rules_exit(net); #endif + rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; @@ -1002,6 +1081,7 @@ static void ip_fib_net_exit(struct net *net) fib_free_table(tb); } } + rtnl_unlock(); kfree(net->ipv4.fib_table_hash); } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 4ec3238..af0f14a 100644 --- a/net/ipv4/fib_lookup.h 
+++ b/net/ipv4/fib_lookup.h @@ -10,7 +10,6 @@ struct fib_alias { struct fib_info *fa_info; u8 fa_tos; u8 fa_type; - u8 fa_scope; u8 fa_state; struct rcu_head rcu; }; @@ -29,7 +28,7 @@ extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 622ac4c..641a5a2 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -222,7 +222,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; - val ^= fi->fib_protocol; + val ^= (fi->fib_protocol << 8) | fi->fib_scope; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { @@ -248,10 +248,11 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, - sizeof(fi->fib_metrics)) == 0 && + sizeof(u32) * RTAX_MAX) == 0 && ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; @@ -328,7 +329,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, goto errout; err = fib_dump_info(skb, info->pid, seq, event, tb_id, - fa->fa_type, fa->fa_scope, key, dst_len, + fa->fa_type, key, dst_len, fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -695,6 +696,16 @@ static void 
fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_parent->fib_scope); + nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + + return nh->nh_saddr; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -753,6 +764,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; + fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; @@ -854,10 +866,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } change_nexthops(fi) { - nexthop_nh->nh_cfg_scope = cfg->fc_scope; - nexthop_nh->nh_saddr = inet_select_addr(nexthop_nh->nh_dev, - nexthop_nh->nh_gw, - nexthop_nh->nh_cfg_scope); + fib_info_update_nh_saddr(net, nexthop_nh); } endfor_nexthops(fi) link_it: @@ -906,7 +915,7 @@ failure: } int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; @@ -928,7 +937,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; + rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len) @@ -1084,7 +1093,7 @@ void fib_select_default(struct fib_result *res) list_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - if (fa->fa_scope != res->scope || + if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; @@ -1128,24 +1137,6 @@ out: return; } -void fib_update_nh_saddrs(struct net_device *dev) -{ - struct hlist_head 
*head; - struct hlist_node *node; - struct fib_nh *nh; - unsigned int hash; - - hash = fib_devindex_hashfn(dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_for_each_entry(nh, node, head, nh_hash) { - if (nh->nh_dev != dev) - continue; - nh->nh_saddr = inet_select_addr(nh->nh_dev, - nh->nh_gw, - nh->nh_cfg_scope); - } -} - #ifdef CONFIG_IP_ROUTE_MULTIPATH /* diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3d28a35..b92c86f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1245,7 +1245,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && fa->fa_info == fi) { fa_match = fa; break; @@ -1271,7 +1270,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_tos = fa->fa_tos; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1308,7 +1306,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; /* * Insert new entry to the list. 
@@ -1362,15 +1359,15 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; - if (fa->fa_scope < flp->flowi4_scope) + if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_miss++; + t->stats.semantic_match_passed++; #endif - return 1; + return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; @@ -1388,7 +1385,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, res->prefixlen = plen; res->nh_sel = nhsel; res->type = fa->fa_type; - res->scope = fa->fa_scope; + res->scope = fa->fa_info->fib_scope; res->fi = fi; res->table = tb; res->fa_head = &li->falh; @@ -1664,7 +1661,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && + fa->fa_info->fib_scope == cfg->fc_scope) && + (!cfg->fc_prefsrc || + fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(cfg, fi) == 0) { @@ -1861,7 +1860,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, RTM_NEWROUTE, tb->tb_id, fa->fa_type, - fa->fa_scope, xkey, plen, fa->fa_tos, @@ -2382,7 +2380,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth+1); seq_printf(seq, " /%d %s %s", li->plen, rtn_scope(buf1, sizeof(buf1), - fa->fa_scope), + fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_tos) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1906fa3..28a736f 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -140,11 +140,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) } else { dopt->ts_needtime = 0; - if (soffset + 8 <= optlen) { + if (soffset + 7 <= 
optlen) { __be32 addr; - memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { + memcpy(&addr, dptr+soffset-1, 4); + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e837ffd..2d3c72e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -569,6 +569,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); + rt = NULL; goto done; } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 870b518..4b0c811 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1593,8 +1593,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) rt->rt_peer_genid = rt_peer_genid(); } check_peer_pmtu(dst, peer); - - inet_putpeer(peer); } } @@ -1720,7 +1718,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) - src = FIB_RES_PREFSRC(res); + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); @@ -2617,7 +2615,7 @@ static struct rtable *ip_route_output_slow(struct net *net, fib_select_default(&res); if (!fl4.saddr) - fl4.saddr = FIB_RES_PREFSRC(res); + fl4.saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); fl4.flowi4_oif = dev_out->ifindex; @@ -3221,6 +3219,8 @@ static __net_init int rt_genid_init(struct net *net) { get_random_bytes(&net->ipv4.rt_genid, sizeof(net->ipv4.rt_genid)); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da782e7..bef9f04 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2659,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) 
do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const int undo) +static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); @@ -2671,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } - tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2699,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else @@ -2726,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } @@ -2779,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, 0); + tcp_undo_cwr(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. 
@@ -2808,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; @@ -2822,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk) static inline void tcp_complete_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_stamp = tcp_time_stamp; + /* Do not moderate cwnd if it's already undone in cwr or recovery */ + if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; + } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -3494,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); } /* F-RTO spurious RTO detection algorithm (RFC4138) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7ff0343..29e4859 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -663,7 +663,7 @@ static int pim6_rcv(struct sk_buff *skb) skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); - skb->ip_summed = 0; + skb->ip_summed = CHECKSUM_NONE; skb->pkt_type = PACKET_HOST; skb_tunnel_rx(skb, reg_dev); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0e49c9d..92f952d 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -341,6 +341,8 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; + case ARPHRD_IPGRE: + return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6814c87..843406f 100644 --- a/net/ipv6/route.c +++ 
b/net/ipv6/route.c @@ -854,7 +854,7 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, struct sock *sk, +struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { int flags = 0; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 7cb65ef..6dcf5e7 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,16 @@ static struct ctl_table empty[1]; +static ctl_table ipv6_static_skeleton[] = { + { + .procname = "neigh", + .maxlen = 0, + .mode = 0555, + .child = empty, + }, + { } +}; + static ctl_table ipv6_table_template[] = { { .procname = "route", @@ -37,12 +47,6 @@ static ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "neigh", - .maxlen = 0, - .mode = 0555, - .child = empty, - }, { } }; @@ -160,7 +164,7 @@ static struct ctl_table_header *ip6_base; int ipv6_static_sysctl_register(void) { - ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty); + ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton); if (ip6_base == NULL) return -ENOMEM; return 0; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 2731b51..9680226 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -148,7 +148,6 @@ static void ipx_destroy_socket(struct sock *sk) ipx_remove_socket(sk); skb_queue_purge(&sk->sk_receive_queue); sk_refcnt_debug_dec(sk); - sock_put(sk); } /* @@ -1404,6 +1403,7 @@ static int ipx_release(struct socket *sock) sk_refcnt_debug_release(sk); ipx_destroy_socket(sk); release_sock(sk); + sock_put(sk); out: return 0; } diff --git a/net/irda/iriap.c b/net/irda/iriap.c index 5b743bd..3647753 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -656,10 +656,16 @@ static void iriap_getvaluebyclass_indication(struct iriap_cb *self, n = 1; name_len = fp[n++]; + + 
IRDA_ASSERT(name_len < IAS_MAX_CLASSNAME + 1, return;); + memcpy(name, fp+n, name_len); n+=name_len; name[name_len] = '\0'; attr_len = fp[n++]; + + IRDA_ASSERT(attr_len < IAS_MAX_ATTRIBNAME + 1, return;); + memcpy(attr, fp+n, attr_len); n+=attr_len; attr[attr_len] = '\0'; diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 7c567b8..2bb2beb 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -105,6 +105,9 @@ irnet_ctrl_write(irnet_socket * ap, while(isspace(start[length - 1])) length--; + DABORT(length < 5 || length > NICKNAME_MAX_LEN + 5, + -EINVAL, CTRL_ERROR, "Invalid nickname.\n"); + /* Copy the name for later reuse */ memcpy(ap->rname, start + 5, length - 5); ap->rname[length - 5] = '\0'; diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 8d9ce0a..a8193f5 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -283,7 +283,7 @@ static __net_init int l2tp_eth_init_net(struct net *net) return 0; } -static __net_initdata struct pernet_operations l2tp_eth_net_ops = { +static struct pernet_operations l2tp_eth_net_ops = { .init = l2tp_eth_init_net, .id = &l2tp_eth_net_id, .size = sizeof(struct l2tp_eth_net), diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 5c48ffb..2dc6de1 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -43,6 +43,8 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); +static DEFINE_MUTEX(__ip_vs_app_mutex); + /* * Get an ip_vs_app object */ @@ -167,14 +169,13 @@ int register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, __u16 port) { - struct netns_ipvs *ipvs = net_ipvs(net); int result; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); result = ip_vs_app_inc_new(net, app, proto, port); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return result; } @@ -189,11 +190,11 @@ int register_ip_vs_app(struct 
net *net, struct ip_vs_app *app) /* increase the module use count */ ip_vs_use_count_inc(); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_add(&app->a_list, &ipvs->app_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return 0; } @@ -205,10 +206,9 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *inc, *nxt; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { ip_vs_app_inc_release(net, inc); @@ -216,7 +216,7 @@ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) list_del(&app->a_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -501,7 +501,7 @@ static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); return *pos ? 
ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } @@ -535,9 +535,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { - struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq)); - - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) @@ -583,7 +581,6 @@ static int __net_init __ip_vs_app_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b799cea..33733c8 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3605,7 +3605,7 @@ int __net_init __ip_vs_control_init(struct net *net) /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (ipvs->tot_stats.cpustats) { + if (!ipvs->tot_stats.cpustats) { pr_err("%s(): alloc_percpu.\n", __func__); return -ENOMEM; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 75ea686..6daaa49 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -33,8 +33,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> - -#include <asm-generic/bitops/le.h> +#include <linux/bitops.h> #include "rds.h" @@ -285,7 +284,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___set_le_bit(off, (void *)map->m_page_addrs[i]); + __set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) @@ -299,7 +298,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - 
generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); + __clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) @@ -310,7 +309,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); + return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5ee0c62..a80aef6 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -978,7 +978,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros struct sock *make; struct rose_sock *make_rose; struct rose_facilities_struct facilities; - int n, len; + int n; skb->sk = NULL; /* Initially we don't know who it's for */ @@ -987,9 +987,9 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros */ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); - len = (((skb->data[3] >> 4) & 0x0F) + 1) >> 1; - len += (((skb->data[3] >> 0) & 0x0F) + 1) >> 1; - if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { + if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, + skb->len - ROSE_CALL_REQ_FACILITIES_OFF, + &facilities)) { rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; } diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index ae4a9d9..3444562 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -73,9 +73,20 @@ static void rose_loopback_timer(unsigned long param) unsigned int lci_i, lci_o; while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + if (skb->len < ROSE_MIN_LEN) { + kfree_skb(skb); + continue; + } lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); frametype = skb->data[2]; - dest = (rose_address 
*)(skb->data + 4); + if (frametype == ROSE_CALL_REQUEST && + (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || + skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != + ROSE_CALL_REQ_ADDR_LEN_VAL)) { + kfree_skb(skb); + continue; + } + dest = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); lci_o = ROSE_DEFAULT_MAXVC + 1 - lci_i; skb_reset_transport_header(skb); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 88a77e9..08dcd2f 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -861,7 +861,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) unsigned int lci, new_lci; unsigned char cause, diagnostic; struct net_device *dev; - int len, res = 0; + int res = 0; char buf[11]; #if 0 @@ -869,10 +869,17 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) return res; #endif + if (skb->len < ROSE_MIN_LEN) + return res; frametype = skb->data[2]; lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); - src_addr = (rose_address *)(skb->data + 9); - dest_addr = (rose_address *)(skb->data + 4); + if (frametype == ROSE_CALL_REQUEST && + (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || + skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != + ROSE_CALL_REQ_ADDR_LEN_VAL)) + return res; + src_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_SRC_ADDR_OFF); + dest_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); spin_lock_bh(&rose_neigh_list_lock); spin_lock_bh(&rose_route_list_lock); @@ -1010,12 +1017,11 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) goto out; } - len = (((skb->data[3] >> 4) & 0x0F) + 1) >> 1; - len += (((skb->data[3] >> 0) & 0x0F) + 1) >> 1; - memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); - if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { + if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, + skb->len - ROSE_CALL_REQ_FACILITIES_OFF, + &facilities)) { rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76); goto out; } diff --git 
a/net/rose/rose_subr.c b/net/rose/rose_subr.c index 1734abb..f6c71ca 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -142,7 +142,7 @@ void rose_write_internal(struct sock *sk, int frametype) *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; - *dptr++ = 0xAA; + *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL; memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); @@ -246,12 +246,16 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities_struct * do { switch (*p & 0xC0) { case 0x00: + if (len < 2) + return -1; p += 2; n += 2; len -= 2; break; case 0x40: + if (len < 3) + return -1; if (*p == FAC_NATIONAL_RAND) facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); p += 3; @@ -260,40 +264,61 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities_struct * break; case 0x80: + if (len < 4) + return -1; p += 4; n += 4; len -= 4; break; case 0xC0: + if (len < 2) + return -1; l = p[1]; + if (len < 2 + l) + return -1; if (*p == FAC_NATIONAL_DEST_DIGI) { if (!fac_national_digis_received) { + if (l < AX25_ADDR_LEN) + return -1; memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); facilities->source_ndigis = 1; } } else if (*p == FAC_NATIONAL_SRC_DIGI) { if (!fac_national_digis_received) { + if (l < AX25_ADDR_LEN) + return -1; memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); facilities->dest_ndigis = 1; } } else if (*p == FAC_NATIONAL_FAIL_CALL) { + if (l < AX25_ADDR_LEN) + return -1; memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); } else if (*p == FAC_NATIONAL_FAIL_ADD) { + if (l < 1 + ROSE_ADDR_LEN) + return -1; memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); } else if (*p == FAC_NATIONAL_DIGIS) { + if (l % AX25_ADDR_LEN) + return -1; fac_national_digis_received = 1; facilities->source_ndigis = 0; facilities->dest_ndigis = 0; for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { - if (pt[6] & 
AX25_HBIT) + if (pt[6] & AX25_HBIT) { + if (facilities->dest_ndigis >= ROSE_MAX_DIGIS) + return -1; memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); - else + } else { + if (facilities->source_ndigis >= ROSE_MAX_DIGIS) + return -1; memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); + } } } p += l + 2; @@ -314,25 +339,38 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *fac do { switch (*p & 0xC0) { case 0x00: + if (len < 2) + return -1; p += 2; n += 2; len -= 2; break; case 0x40: + if (len < 3) + return -1; p += 3; n += 3; len -= 3; break; case 0x80: + if (len < 4) + return -1; p += 4; n += 4; len -= 4; break; case 0xC0: + if (len < 2) + return -1; l = p[1]; + + /* Prevent overflows*/ + if (l < 10 || l > 20) + return -1; + if (*p == FAC_CCITT_DEST_NSAP) { memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); @@ -355,45 +393,44 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *fac return n; } -int rose_parse_facilities(unsigned char *p, +int rose_parse_facilities(unsigned char *p, unsigned packet_len, struct rose_facilities_struct *facilities) { int facilities_len, len; facilities_len = *p++; - if (facilities_len == 0) + if (facilities_len == 0 || (unsigned)facilities_len > packet_len) return 0; - while (facilities_len > 0) { - if (*p == 0x00) { - facilities_len--; - p++; - - switch (*p) { - case FAC_NATIONAL: /* National */ - len = rose_parse_national(p + 1, facilities, facilities_len - 1); - facilities_len -= len + 1; - p += len + 1; - break; - - case FAC_CCITT: /* CCITT */ - len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); - facilities_len -= len + 1; - p += len + 1; - break; - - default: - printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); - facilities_len--; - p++; - break; - } - } else - break; /* Error in facilities format */ + while (facilities_len >= 3 
&& *p == 0x00) { + facilities_len--; + p++; + + switch (*p) { + case FAC_NATIONAL: /* National */ + len = rose_parse_national(p + 1, facilities, facilities_len - 1); + break; + + case FAC_CCITT: /* CCITT */ + len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); + break; + + default: + printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); + len = 1; + break; + } + + if (len < 0) + return 0; + if (WARN_ON(len >= facilities_len)) + return 0; + facilities_len -= len + 1; + p += len + 1; } - return 1; + return facilities_len == 0; } static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 152976e..d5bf91d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1205,7 +1205,7 @@ SCTP_STATIC __init int sctp_init(void) if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0) continue; sctp_assoc_hashtable = (struct sctp_hashbucket *) - __get_free_pages(GFP_ATOMIC, order); + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); } while (!sctp_assoc_hashtable && --order > 0); if (!sctp_assoc_hashtable) { pr_err("Failed association hash alloc\n"); @@ -1238,7 +1238,7 @@ SCTP_STATIC __init int sctp_init(void) if ((sctp_port_hashsize > (64 * 1024)) && order > 0) continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC, order); + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 8b40610..e3c36a2 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -160,6 +160,28 @@ gss_mech_get_by_name(const char *name) EXPORT_SYMBOL_GPL(gss_mech_get_by_name); +struct gss_api_mech * +gss_mech_get_by_OID(struct xdr_netobj *obj) +{ + struct gss_api_mech *pos, *gm = NULL; + + 
spin_lock(&registered_mechs_lock); + list_for_each_entry(pos, &registered_mechs, gm_list) { + if (obj->len == pos->gm_oid.len) { + if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + } + spin_unlock(&registered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL_GPL(gss_mech_get_by_OID); + static inline int mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) { @@ -193,6 +215,22 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); +int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) +{ + struct gss_api_mech *pos = NULL; + int i = 0; + + spin_lock(&registered_mechs_lock); + list_for_each_entry(pos, &registered_mechs, gm_list) { + array_ptr[i] = pos->gm_pfs->pseudoflavor; + i++; + } + spin_unlock(&registered_mechs_lock); + return i; +} + +EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors); + u32 gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index ffb6876..6b43ee7 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -860,8 +860,10 @@ static void rpc_release_resources_task(struct rpc_task *task) { if (task->tk_rqstp) xprt_release(task); - if (task->tk_msg.rpc_cred) + if (task->tk_msg.rpc_cred) { put_rpccred(task->tk_msg.rpc_cred); + task->tk_msg.rpc_cred = NULL; + } rpc_task_release_client(task); } diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 30916b0..c8e1021 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -38,6 +38,14 @@ struct unix_domain { extern struct auth_ops svcauth_unix; +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + struct unix_domain *ud = container_of(dom, struct unix_domain, h); + + kfree(dom->name); + kfree(ud); +} + struct auth_domain *unix_domain_find(char *name) { struct auth_domain *rv; @@ -47,7 +55,7 @@ struct auth_domain *unix_domain_find(char *name) while(1) { if (rv) { if (new 
&& rv != &new->h) - auth_domain_put(&new->h); + svcauth_unix_domain_release(&new->h); if (rv->flavour != &svcauth_unix) { auth_domain_put(rv); @@ -74,14 +82,6 @@ struct auth_domain *unix_domain_find(char *name) } EXPORT_SYMBOL_GPL(unix_domain_find); -static void svcauth_unix_domain_release(struct auth_domain *dom) -{ - struct unix_domain *ud = container_of(dom, struct unix_domain, h); - - kfree(dom->name); - kfree(ud); -} - /************************************************** * cache for IP address to unix_domain diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index be96d42..1e336a0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -710,6 +710,8 @@ static void xs_reset_transport(struct sock_xprt *transport) if (sk == NULL) return; + transport->srcport = 0; + write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 872065c..a026b0e 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -173,7 +173,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop_unlock; } - if (x->props.replay_window && x->repl->check(x, skb, seq)) { + if (x->repl->check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } @@ -190,6 +190,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + skb_dst_force(skb); + nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 1aba03f..47bacd8 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -78,6 +78,8 @@ static int xfrm_output_one(struct sk_buff *skb, int err) spin_unlock_bh(&x->lock); + skb_dst_force(skb); + err = x->type->output(x, skb); if (err == -EINPROGRESS) goto out_exit; @@ -94,7 +96,7 @@ resume: err = -EHOSTUNREACH; goto error_nolock; } - 
skb_dst_set(skb, dst_clone(dst)); + skb_dst_set(skb, dst); x = dst->xfrm; } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 2f5be5b..f218385 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -118,6 +118,9 @@ static int xfrm_replay_check(struct xfrm_state *x, u32 diff; u32 seq = ntohl(net_seq); + if (!x->props.replay_window) + return 0; + if (unlikely(seq == 0)) goto err; @@ -193,9 +196,14 @@ static int xfrm_replay_check_bmp(struct xfrm_state *x, { unsigned int bitnr, nr; struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 pos; u32 seq = ntohl(net_seq); u32 diff = replay_esn->seq - seq; - u32 pos = (replay_esn->seq - 1) % replay_esn->replay_window; + + if (!replay_esn->replay_window) + return 0; + + pos = (replay_esn->seq - 1) % replay_esn->replay_window; if (unlikely(seq == 0)) goto err; @@ -373,12 +381,17 @@ static int xfrm_replay_check_esn(struct xfrm_state *x, unsigned int bitnr, nr; u32 diff; struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 pos; u32 seq = ntohl(net_seq); - u32 pos = (replay_esn->seq - 1) % replay_esn->replay_window; u32 wsize = replay_esn->replay_window; u32 top = replay_esn->seq; u32 bottom = top - wsize + 1; + if (!wsize) + return 0; + + pos = (replay_esn->seq - 1) % replay_esn->replay_window; + if (unlikely(seq == 0 && replay_esn->seq_hi == 0 && (replay_esn->seq < replay_esn->replay_window - 1))) goto err; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d575f05..dd78536 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1181,6 +1181,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) goto error; } + if (orig->replay_esn) { + err = xfrm_replay_clone(x, orig); + if (err) + goto error; + } + memcpy(&x->mark, &orig->mark, sizeof(x->mark)); err = xfrm_init_state(x); @@ -1907,7 +1913,7 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu) return res; } -int 
xfrm_init_state(struct xfrm_state *x) +int __xfrm_init_state(struct xfrm_state *x, bool init_replay) { struct xfrm_state_afinfo *afinfo; struct xfrm_mode *inner_mode; @@ -1980,12 +1986,25 @@ int xfrm_init_state(struct xfrm_state *x) if (x->outer_mode == NULL) goto error; + if (init_replay) { + err = xfrm_init_replay(x); + if (err) + goto error; + } + x->km.state = XFRM_STATE_VALID; error: return err; } +EXPORT_SYMBOL(__xfrm_init_state); + +int xfrm_init_state(struct xfrm_state *x) +{ + return __xfrm_init_state(x, true); +} + EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 706385a..3d15d3e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -127,6 +127,9 @@ static inline int verify_replay(struct xfrm_usersa_info *p, if (!rt) return 0; + if (p->id.proto != IPPROTO_ESP) + return -EINVAL; + if (p->replay_window != 0) return -EINVAL; @@ -360,6 +363,23 @@ static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, return 0; } +static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn, + struct nlattr *rp) +{ + struct xfrm_replay_state_esn *up; + + if (!replay_esn || !rp) + return 0; + + up = nla_data(rp); + + if (xfrm_replay_state_esn_len(replay_esn) != + xfrm_replay_state_esn_len(up)) + return -EINVAL; + + return 0; +} + static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn, struct xfrm_replay_state_esn **preplay_esn, struct nlattr *rta) @@ -511,7 +531,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - err = xfrm_init_state(x); + err = __xfrm_init_state(x, false); if (err) goto error; @@ -1766,6 +1786,10 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, if (x->km.state != XFRM_STATE_VALID) goto out; + err = xfrm_replay_verify_len(x->replay_esn, rp); + if (err) + goto out; + spin_lock_bh(&x->lock); xfrm_update_ae_params(x, attrs); 
spin_unlock_bh(&x->lock); |