From 04600794958f1833f5571c6cde40f260ab557f55 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Thu, 5 Aug 2010 17:45:15 +0200
Subject: cfg80211: support sysfs namespaces

Enable using network namespaces with wireless devices even when sysfs
is enabled, using the same infrastructure that was built for netdevs.

Signed-off-by: Johannes Berg
Acked-by: "Eric W. Biederman"
Signed-off-by: John W. Linville
---
 net/core/net-sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af4dfba..7d74854 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -789,12 +789,13 @@ static const void *net_netlink_ns(struct sock *sk)
 	return sock_net(sk);
 }
 
-static struct kobj_ns_type_operations net_ns_type_operations = {
+struct kobj_ns_type_operations net_ns_type_operations = {
 	.type = KOBJ_NS_TYPE_NET,
 	.current_ns = net_current_ns,
 	.netlink_ns = net_netlink_ns,
 	.initial_ns = net_initial_ns,
 };
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
 
 static void net_kobj_ns_exit(struct net *net)
 {
--
cgit v1.1
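For context, exporting net_ns_type_operations lets a sysfs class declare itself network-namespace aware. A minimal sketch of the consuming side, modeled on what cfg80211's sysfs code does with this export (the wiphy helper and the exact struct class field layout here are assumptions for illustration, not copied from the wireless tree):

	/* Sketch: tag a device class as net-namespace aware. */
	static const void *wiphy_namespace(struct device *d)
	{
		struct wiphy *wiphy = container_of(d, struct wiphy, dev);

		return wiphy_net(wiphy);	/* the struct net this wiphy lives in */
	}

	static struct class ieee80211_class = {
		.name		= "ieee80211",
		.owner		= THIS_MODULE,
		.ns_type	= &net_ns_type_operations,	/* exported above */
		.namespace	= wiphy_namespace,
	};

With this in place, sysfs shows a wiphy only in the namespace it belongs to, exactly as it already did for netdevs.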
From bfb564e7391340638afe4ad67744a8f3858e7566 Mon Sep 17 00:00:00 2001
From: Krishna Kumar
Date: Wed, 4 Aug 2010 06:15:52 +0000
Subject: core: Factor out flow calculation from get_rps_cpu

Factor out flow calculation code from get_rps_cpu, since other
functions can use the same code.

Revisions:

v2 (Ben): Separate flow calculation out and use in select queue.
v3 (Arnd): Don't re-implement MIN.
v4 (Changli): skb->data points to ethernet header in macvtap, and
    make a fast path. Tested macvtap with this patch.
v5 (Changli):
    - Cache skb->rxhash in skb_get_rxhash
    - macvtap may not have pow(2) queues, so change code for
      queue selection.
   (Arnd):
    - Use first available queue if all fails.

Signed-off-by: Krishna Kumar
Signed-off-by: David S. Miller
---
 net/core/dev.c | 106 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 62 insertions(+), 44 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1ae6543..586a11c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2259,69 +2259,41 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
 /*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers.  Returns a non-zero hash number on success
+ * and 0 on failure.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-		       struct rps_dev_flow **rflowp)
+__u32 __skb_get_rxhash(struct sk_buff *skb)
 {
+	int nhoff, hash = 0;
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
-	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
-	int cpu = -1;
 	u8 ip_proto;
-	u16 tcpu;
 	u32 addr1, addr2, ihl;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;
 
-	if (skb_rx_queue_recorded(skb)) {
-		u16 index = skb_get_rx_queue(skb);
-		if (unlikely(index >= dev->num_rx_queues)) {
-			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-				"on queue %u, but number of RX queues is %u\n",
-				dev->name, index, dev->num_rx_queues);
-			goto done;
-		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
-
-	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
-		goto done;
-
-	if (skb->rxhash)
-		goto got_hash; /* Skip hash computation on packet header */
+	nhoff = skb_network_offset(skb);
 
 	switch (skb->protocol) {
 	case __constant_htons(ETH_P_IP):
-		if (!pskb_may_pull(skb, sizeof(*ip)))
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
 
-		ip = (struct iphdr *) skb->data;
+		ip = (struct iphdr *) skb->data + nhoff;
 		ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
 		ihl = ip->ihl;
 		break;
 	case __constant_htons(ETH_P_IPV6):
-		if (!pskb_may_pull(skb, sizeof(*ip6)))
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;
 
-		ip6 = (struct ipv6hdr *) skb->data;
+		ip6 = (struct ipv6hdr *) skb->data + nhoff;
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,6 +2302,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	default:
 		goto done;
 	}
+
 	switch (ip_proto) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
@@ -2338,8 +2311,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	case IPPROTO_AH:
 	case IPPROTO_SCTP:
 	case IPPROTO_UDPLITE:
-		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
-			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+		if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) {
+			ports.v32 = * (__force u32 *) (skb->data + nhoff +
+						       (ihl * 4));
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
 			break;
@@ -2352,11 +2326,55 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	/* get a consistent hash (same value on both flow directions) */
 	if (addr2 < addr1)
 		swap(addr1, addr2);
-	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
-	if (!skb->rxhash)
-		skb->rxhash = 1;
 
-got_hash:
+	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+	if (!hash)
+		hash = 1;
+
+done:
+	return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
+	int cpu = -1;
+	u16 tcpu;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->num_rx_queues)) {
+			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
+				"on queue %u, but number of RX queues is %u\n",
+				dev->name, index, dev->num_rx_queues);
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
+		goto done;
+
+	if (!skb_get_rxhash(skb))
+		goto done;
+
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
--
cgit v1.1
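The factored-out helper makes the flow key explicit: two addresses and a sorted port pair, mixed with a random seed, never returning 0. A self-contained sketch of the same symmetric-hash idea; the mixer below is a stand-in for the kernel's jhash_3words() with its boot-time hashrnd, not a copy of it:

	#include <stdint.h>

	/* Stand-in mixer; the kernel uses jhash_3words() with a random seed. */
	static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
	{
		a ^= seed; a *= 0x9e3779b1u;
		b ^= a;    b = (b << 13) | (b >> 19);
		c ^= b;    c *= 0x85ebca6bu;
		return a ^ b ^ c;
	}

	/* Symmetric flow hash: both directions of a flow hash identically,
	 * because addresses and ports are each put in a canonical order. */
	uint32_t flow_hash(uint32_t saddr, uint32_t daddr,
			   uint16_t sport, uint16_t dport, uint32_t seed)
	{
		uint32_t h, ports;

		if (daddr < saddr) {
			uint32_t t = saddr; saddr = daddr; daddr = t;
		}
		if (dport < sport) {
			uint16_t t = sport; sport = dport; dport = t;
		}
		ports = ((uint32_t)sport << 16) | dport;

		h = mix3(saddr, daddr, ports, seed);
		return h ? h : 1;	/* 0 is reserved for "no hash computed" */
	}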
From 01414802054c382072b6cb9a1bdc6e243c74b2d5 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Tue, 17 Aug 2010 02:31:15 -0700
Subject: ethtool: Provide a default implementation of ethtool_ops::get_drvinfo

The driver name and bus address for a net_device can normally be found
through the driver model now.  Instead of requiring drivers to provide
this information redundantly through the ethtool_ops::get_drvinfo
operation, use the driver model to do so if the driver does not define
the operation.

Since ETHTOOL_GDRVINFO no longer requires the driver to implement any
operations, do not require net_device::ethtool_ops to be set either.

Remove implementations of get_drvinfo and ethtool_ops that provide
only this information.

Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 7a85367..d2c4da5 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -205,18 +205,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
 	struct ethtool_drvinfo info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 
-	if (!ops->get_drvinfo)
-		return -EOPNOTSUPP;
-
 	memset(&info, 0, sizeof(info));
 	info.cmd = ETHTOOL_GDRVINFO;
-	ops->get_drvinfo(dev, &info);
+	if (ops && ops->get_drvinfo) {
+		ops->get_drvinfo(dev, &info);
+	} else if (dev->dev.parent && dev->dev.parent->driver) {
+		strlcpy(info.bus_info, dev_name(dev->dev.parent),
+			sizeof(info.bus_info));
+		strlcpy(info.driver, dev->dev.parent->driver->name,
+			sizeof(info.driver));
+	} else {
+		return -EOPNOTSUPP;
+	}
 
 	/*
 	 * this method of obtaining string set info is deprecated;
 	 * Use ETHTOOL_GSSET_INFO instead.
 	 */
-	if (ops->get_sset_count) {
+	if (ops && ops->get_sset_count) {
 		int rc;
 
 		rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -229,9 +235,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
 		if (rc >= 0)
 			info.n_priv_flags = rc;
 	}
-	if (ops->get_regs_len)
+	if (ops && ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
-	if (ops->get_eeprom_len)
+	if (ops && ops->get_eeprom_len)
 		info.eedump_len = ops->get_eeprom_len(dev);
 
 	if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -1402,12 +1408,19 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (!dev || !netif_device_present(dev))
 		return -ENODEV;
 
-	if (!dev->ethtool_ops)
-		return -EOPNOTSUPP;
-
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (!dev->ethtool_ops) {
+		/* ETHTOOL_GDRVINFO does not require any driver support.
+		 * It is also unprivileged and does not change anything,
+		 * so we can take a shortcut to it. */
+		if (ethcmd == ETHTOOL_GDRVINFO)
+			return ethtool_get_drvinfo(dev, useraddr);
+		else
+			return -EOPNOTSUPP;
+	}
+
 	/* Allow some commands to be done by anyone */
 	switch (ethcmd) {
 	case ETHTOOL_GDRVINFO:
--
cgit v1.1
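With this in place, ETHTOOL_GDRVINFO works even against a driver with no ethtool_ops. A short userspace sketch of the query path this commit services (the interface name "eth0" is an assumption; error handling kept minimal):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	int main(void)
	{
		struct ethtool_drvinfo drvinfo = { .cmd = ETHTOOL_GDRVINFO };
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
		ifr.ifr_data = (void *)&drvinfo;

		if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
			printf("driver=%s bus=%s\n", drvinfo.driver, drvinfo.bus_info);
		close(fd);
		return 0;
	}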
From 2244d07bfa2097cb00600da91c715a8aa547917e Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp
Date: Tue, 17 Aug 2010 08:59:14 +0000
Subject: net: simplify flags for tx timestamping

This patch removes the abstraction introduced by the union skb_shared_tx
in the shared skb data.

The access of the different union elements at several places led to some
confusion about accessing the shared tx_flags, e.g. in skb_orphan_try().

http://marc.info/?l=linux-netdev&m=128084897415886&w=2

Signed-off-by: Oliver Hartkopp
Signed-off-by: David S. Miller
---
 net/core/dev.c    | 6 +++---
 net/core/skbuff.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 586a11c..c1dc8a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1902,14 +1902,14 @@ static int dev_gso_segment(struct sk_buff *skb)
 
 /*
  * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested, since
- * drivers need to call skb_tstamp_tx() to send the timestamp.
+ * We cannot orphan skb if tx timestamp is requested or the sk-reference
+ * is needed on driver level for other reasons, e.g. see net/can/raw.c
 */
 static inline void skb_orphan_try(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	if (sk && !skb_tx(skb)->flags) {
+	if (sk && !skb_shinfo(skb)->tx_flags) {
 		/* skb_tx_hash() wont be able to get sk.
 		 * We copy sk_hash into skb->rxhash
 		 */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3a2513f..99ef721 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3016,7 +3016,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 	} else {
 		/*
 		 * no hardware time stamps available,
-		 * so keep the skb_shared_tx and only
+		 * so keep the shared tx_flags and only
 		 * store software time stamp
 		 */
 		skb->tstamp = ktime_get_real();
--
cgit v1.1

From 2d47b45951af087c1a4439c559309b0bf90a0718 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Tue, 17 Aug 2010 19:00:56 +0000
Subject: net: rps: reset network header before calling skb_get_rxhash()

skb_get_rxhash() assumes the network header pointer of the skb is set
properly after the commit:

    commit bfb564e7391340638afe4ad67744a8f3858e7566
    Author: Krishna Kumar
    Date:   Wed Aug 4 06:15:52 2010 +0000

        core: Factor out flow calculation from get_rps_cpu

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c1dc8a9..cf87fde 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2372,6 +2372,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
 		goto done;
 
+	skb_reset_network_header(skb);
 	if (!skb_get_rxhash(skb))
 		goto done;
 
--
cgit v1.1
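The tx_flags consulted by skb_orphan_try() above are set when a socket asks for transmit timestamps. A small userspace sketch of that request via SO_TIMESTAMPING (an API present since 2.6.30; if your libc headers predate it, the constant lives in <asm/socket.h>):

	#include <sys/socket.h>
	#include <linux/net_tstamp.h>

	/* Ask for TX timestamps on socket fd; this is what ends up setting
	 * the skb_shinfo(skb)->tx_flags bits checked in skb_orphan_try(). */
	static int enable_tx_timestamps(int fd)
	{
		int val = SOF_TIMESTAMPING_TX_HARDWARE |
			  SOF_TIMESTAMPING_TX_SOFTWARE |
			  SOF_TIMESTAMPING_SOFTWARE;

		return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
	}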
From dbe5775bbc00116ed5699babfe17c54f32eb34c3 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Tue, 17 Aug 2010 19:01:38 +0000
Subject: net: rps: skip fragment when computing rxhash

Fragmented IP packets may have no transport header, so when computing
rxhash, we should skip them.

Signed-off-by: Changli Gao
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index cf87fde..7e97e89 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2284,7 +2284,10 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 			goto done;
 
 		ip = (struct iphdr *) skb->data + nhoff;
-		ip_proto = ip->protocol;
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		else
+			ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
 		ihl = ip->ihl;
--
cgit v1.1

From 12fcdefb3643607c47f39906a49056cf608bb545 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Tue, 17 Aug 2010 19:04:32 +0000
Subject: net: rps: use proto_ports_offset() to handle the AH message correctly

The SPI isn't at the beginning of an AH message.

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7e97e89..da584f5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2266,7 +2266,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
  */
 __u32 __skb_get_rxhash(struct sk_buff *skb)
 {
-	int nhoff, hash = 0;
+	int nhoff, hash = 0, poff;
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
 	u8 ip_proto;
@@ -2306,24 +2306,15 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		goto done;
 	}
 
-	switch (ip_proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ESP:
-	case IPPROTO_AH:
-	case IPPROTO_SCTP:
-	case IPPROTO_UDPLITE:
-		if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) {
-			ports.v32 = * (__force u32 *) (skb->data + nhoff +
-						       (ihl * 4));
+	ports.v32 = 0;
+	poff = proto_ports_offset(ip_proto);
+	if (poff >= 0) {
+		nhoff += ihl * 4 + poff;
+		if (pskb_may_pull(skb, nhoff + 4)) {
+			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
-			break;
 		}
-	default:
-		ports.v32 = 0;
-		break;
 	}
 
 	/* get a consistent hash (same value on both flow directions) */
--
cgit v1.1
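proto_ports_offset() encodes the one fact this fix needs: where, relative to the start of the L4 header, the 32 bits worth hashing as a "port pair" live. For AH that is the SPI at offset 4, after the next-header, payload-length and reserved fields. An illustrative mapping of the same shape (a sketch, not the kernel's exact table):

	#include <stdint.h>
	#include <netinet/in.h>

	/* Offset of the 4 hash-worthy bytes within the L4 header,
	 * or -1 when the protocol has nothing port-like to hash. */
	static int ports_offset(uint8_t proto)
	{
		switch (proto) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_DCCP:
		case IPPROTO_ESP:	/* SPI is the first field */
			return 0;
		case IPPROTO_AH:	/* SPI follows a 4-byte preamble */
			return 4;
		default:
			return -1;
		}
	}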
From 1003489e06c04d807c783a8958f2ccc9aed7a244 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Sat, 21 Aug 2010 06:13:28 +0000
Subject: net: rps: fix the wrong network header pointer

__skb_get_rxhash() was broken after the commit:

    commit bfb564e7391340638afe4ad67744a8f3858e7566
    Author: Krishna Kumar
    Date:   Wed Aug 4 06:15:52 2010 +0000

        core: Factor out flow calculation from get_rps_cpu

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index da584f5..4d74d26 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2283,7 +2283,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
 
-		ip = (struct iphdr *) skb->data + nhoff;
+		ip = (struct iphdr *) (skb->data + nhoff);
 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
 			ip_proto = 0;
 		else
@@ -2296,7 +2296,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;
 
-		ip6 = (struct ipv6hdr *) skb->data + nhoff;
+		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
--
cgit v1.1

From 05532121da0728eaedac2a0a5c3cecad3a95d765 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Sun, 22 Aug 2010 21:03:33 -0700
Subject: net: 802.1q: make vlan_hwaccel_do_receive() return void

vlan_hwaccel_do_receive() always returns 0, so make it return void.

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7cd5237..d569f88 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2841,8 +2841,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
-	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
-		return NET_RX_SUCCESS;
+	if (vlan_tx_tag_present(skb))
+		vlan_hwaccel_do_receive(skb);
 
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
--
cgit v1.1
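The one-character fix in the rxhash commit above is easy to misread, so here is the precedence pitfall in isolation: a cast binds tighter than +, making the unparenthesized form scale the offset by sizeof the casted type. A self-contained demonstration:

	#include <stdio.h>

	struct hdr { char bytes[20]; };

	int main(void)
	{
		char buf[512];
		char *data = buf;
		int nhoff = 14;	/* e.g. an Ethernet header before the IP header */

		struct hdr *wrong = (struct hdr *) data + nhoff;   /* buf + 14 * 20 */
		struct hdr *right = (struct hdr *) (data + nhoff); /* buf + 14 */

		printf("wrong=+%td right=+%td\n",
		       (char *)wrong - buf, (char *)right - buf);
		return 0;
	}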
From 21dc330157454046dd7c494961277d76e1c957fe Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Mon, 23 Aug 2010 00:13:46 -0700
Subject: net: Rename skb_has_frags to skb_has_frag_list

SKBs can be "fragmented" in two ways, via a page array (called
skb_shinfo(skb)->frags[]) and via a list of SKBs (called
skb_shinfo(skb)->frag_list).

Since skb_has_frags() tests the latter, its name is confusing
since it sounds more like it's testing the former.

Signed-off-by: David S. Miller
---
 net/core/dev.c    |  4 ++--
 net/core/skbuff.c | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d569f88..859e30f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1930,7 +1930,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
 				      struct net_device *dev)
 {
 	return skb_is_nonlinear(skb) &&
-	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
+	       ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
 		(skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
 					      illegal_highdma(dev, skb))));
 }
@@ -3090,7 +3090,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
 		goto normal;
 
-	if (skb_is_gso(skb) || skb_has_frags(skb))
+	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
 	rcu_read_lock();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 99ef721..e2535fb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,7 +340,7 @@ static void skb_release_data(struct sk_buff *skb)
 				put_page(skb_shinfo(skb)->frags[i].page);
 		}
 
-		if (skb_has_frags(skb))
+		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
 		kfree(skb->head);
@@ -759,7 +759,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 		skb_shinfo(n)->nr_frags = i;
 	}
 
-	if (skb_has_frags(skb)) {
+	if (skb_has_frag_list(skb)) {
 		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
 		skb_clone_fraglist(n);
 	}
@@ -822,7 +822,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 		get_page(skb_shinfo(skb)->frags[i].page);
 
-	if (skb_has_frags(skb))
+	if (skb_has_frag_list(skb))
 		skb_clone_fraglist(skb);
 
 	skb_release_data(skb);
@@ -1099,7 +1099,7 @@ drop_pages:
 		for (; i < nfrags; i++)
 			put_page(skb_shinfo(skb)->frags[i].page);
 
-		if (skb_has_frags(skb))
+		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 		goto done;
 	}
@@ -1194,7 +1194,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	/* Optimization: no fragments, no reasons to preestimate
 	 * size of pulled pages. Superb.
 	 */
-	if (!skb_has_frags(skb))
+	if (!skb_has_frag_list(skb))
 		goto pull_pages;
 
 	/* Estimate size of pulled pages. */
@@ -2323,7 +2323,7 @@ next_skb:
 		st->frag_data = NULL;
 	}
 
-	if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) {
+	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
 		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
 		st->frag_idx = 0;
 		goto next_skb;
@@ -2889,7 +2889,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 		return -ENOMEM;
 
 	/* Easy case. Most of packets will go this way. */
-	if (!skb_has_frags(skb)) {
+	if (!skb_has_frag_list(skb)) {
 		/* A little of trouble, not enough of space for trailer.
 		 * This should not happen, when stack is tuned to generate
 		 * good frames. OK, on miss we reallocate and reserve even more
@@ -2924,7 +2924,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 
 		if (skb1->next == NULL && tailbits) {
 			if (skb_shinfo(skb1)->nr_frags ||
-			    skb_has_frags(skb1) ||
+			    skb_has_frag_list(skb1) ||
 			    skb_tailroom(skb1) < tailbits)
 				ntail = tailbits + 128;
 		}
@@ -2933,7 +2933,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 		    skb_cloned(skb1) ||
 		    ntail ||
 		    skb_shinfo(skb1)->nr_frags ||
-		    skb_has_frags(skb1)) {
+		    skb_has_frag_list(skb1)) {
 			struct sk_buff *skb2;
 
 			/* Fuck, we are miserable poor guys... */
--
cgit v1.1
From afdcba371f9748ac91608bb6c57f170aab7085b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 23 Aug 2010 07:14:36 +0000
Subject: net: copy_rtnl_link_stats64() simplification

No need to use a temporary struct rtnl_link_stats64 variable, just
copy the source to the skb buffer.

Signed-off-by: Eric Dumazet
Reviewed-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/rtnetlink.c | 31 +------------------------------
 1 file changed, 1 insertion(+), 30 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f78d821..b2a718d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -612,36 +612,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 
 static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
 {
-	struct rtnl_link_stats64 a;
-
-	a.rx_packets = b->rx_packets;
-	a.tx_packets = b->tx_packets;
-	a.rx_bytes = b->rx_bytes;
-	a.tx_bytes = b->tx_bytes;
-	a.rx_errors = b->rx_errors;
-	a.tx_errors = b->tx_errors;
-	a.rx_dropped = b->rx_dropped;
-	a.tx_dropped = b->tx_dropped;
-
-	a.multicast = b->multicast;
-	a.collisions = b->collisions;
-
-	a.rx_length_errors = b->rx_length_errors;
-	a.rx_over_errors = b->rx_over_errors;
-	a.rx_crc_errors = b->rx_crc_errors;
-	a.rx_frame_errors = b->rx_frame_errors;
-	a.rx_fifo_errors = b->rx_fifo_errors;
-	a.rx_missed_errors = b->rx_missed_errors;
-
-	a.tx_aborted_errors = b->tx_aborted_errors;
-	a.tx_carrier_errors = b->tx_carrier_errors;
-	a.tx_fifo_errors = b->tx_fifo_errors;
-	a.tx_heartbeat_errors = b->tx_heartbeat_errors;
-	a.tx_window_errors = b->tx_window_errors;
-
-	a.rx_compressed = b->rx_compressed;
-	a.tx_compressed = b->tx_compressed;
-	memcpy(v, &a, sizeof(a));
+	memcpy(v, b, sizeof(*b));
 }
 
 /* All VF info */
--
cgit v1.1

From 0fdc100bdc4b7ab61ed632962c76dfe539047296 Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Mon, 23 Aug 2010 10:24:18 +0000
Subject: ethtool: allow non-netadmin to query settings

The SNMP daemon uses ethtool to determine the speed of network
interfaces.  This fails on Debian (and probably elsewhere) because for
security the SNMP daemon runs as a non-root user (snmp).

Note: A similar patch was rejected previously because of a concern
about the possibility that on some hardware querying the ethtool
settings requires access to the PHY and could slow the machine down.
But the security risk of requiring the SNMP daemon (and related
services) to run as root far outweighs the risk of denial-of-service.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d2c4da5..970eb98 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1423,6 +1423,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 
 	/* Allow some commands to be done by anyone */
 	switch (ethcmd) {
+	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
 	case ETHTOOL_GCOALESCE:
--
cgit v1.1
From 40d0802b3eb47d57e2d57a5244a18cbbe9632e13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 26 Aug 2010 22:03:08 -0700
Subject: gro: __napi_gro_receive() optimizations

compare_ether_header() can have a special implementation on 64 bit
arches if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.

__napi_gro_receive() and vlan_gro_common() can avoid a conditional
branch to perform device match.

On x86_64, __napi_gro_receive() now has 38 instructions instead of 53.

As gcc-4.4.3 still chooses not to inline it, add the inline keyword to
this performance-critical function.

Signed-off-by: Eric Dumazet
CC: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/dev.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 859e30f..63bd20a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3169,16 +3169,18 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static gro_result_t
+static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow =
-			(p->dev == skb->dev) &&
-			!compare_ether_header(skb_mac_header(p),
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= compare_ether_header(skb_mac_header(p),
 					      skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow = !diffs;
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
--
cgit v1.1

From ba4fd9d8282f7f856f2287fe8be784d1dfdda28b Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 31 Aug 2010 01:57:35 +0000
Subject: pktgen: remove unused variable

Remove the unused variable "queue" in pg_cleanup().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/pktgen.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 10a1ea7..386c228 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3907,8 +3907,6 @@ static void __exit pg_cleanup(void)
 {
 	struct pktgen_thread *t;
 	struct list_head *q, *n;
-	wait_queue_head_t queue;
-	init_waitqueue_head(&queue);
 
 	/* Stop all interfaces & threads */
 
--
cgit v1.1

From 86cac58b71227cc34a3d0e78f19585c0eff49ea3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 31 Aug 2010 18:25:32 +0000
Subject: skge: add GRO support

- napi_gro_flush() is exported from net/core/dev.c, to avoid
  an irq_save/irq_restore in the packet receive path.
- use napi_gro_receive() instead of netif_receive_skb()
- use napi_gro_flush() before calling __napi_complete()
- turn on NETIF_F_GRO by default
- Tested on a Marvell 88E8001 Gigabit NIC

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 63bd20a..d8c43e7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3063,7 +3063,7 @@ out:
 	return netif_receive_skb(skb);
 }
 
-static void napi_gro_flush(struct napi_struct *napi)
+inline void napi_gro_flush(struct napi_struct *napi)
 {
 	struct sk_buff *skb, *next;
 
@@ -3076,6 +3076,7 @@ static void napi_gro_flush(struct napi_struct *napi)
 	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
+EXPORT_SYMBOL(napi_gro_flush);
 
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
--
cgit v1.1
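The same_flow rewrite in the GRO commit above is an instance of a general trick: fold every mismatch into one accumulator and branch once at the end, instead of chaining && tests that each cost a conditional branch. A standalone sketch of the idiom, with a plain 14-byte memcmp standing in for compare_ether_header():

	#include <stdint.h>
	#include <string.h>

	/* Branchless "same flow?" check: XOR the device pointers, OR in the
	 * header mismatch, test once.  Returns 1 when both match. */
	static int same_flow(const void *dev_a, const void *dev_b,
			     const unsigned char *hdr_a, const unsigned char *hdr_b)
	{
		unsigned long diffs;

		diffs  = (unsigned long)dev_a ^ (unsigned long)dev_b;
		diffs |= (unsigned long)(memcmp(hdr_a, hdr_b, 14) != 0);
		return diffs == 0;
	}

The payoff is largest in loops like the gro_list walk, where every iteration would otherwise pay for two unpredictable branches.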
From 6602cebb5bcac1fccf2850541f8bf9fcc8c86dee Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 1 Sep 2010 05:25:10 +0000
Subject: net: skbuff.c cleanup

(skb->data - skb->head) can be replaced by skb_headroom(skb).

Remove some uses of NET_SKBUFF_DATA_USES_OFFSET, using
(skb_end_pointer(skb) - skb->head) or
(skb_tail_pointer(skb) - skb->head):
compiler does the right thing, and this is more readable for us ;)

(struct skb_shared_info *) casts in pskb_expand_head() to help
memcpy() to use aligned moves.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 47 +++++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e2535fb..231dff0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -685,16 +685,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
-	int headerlen = skb->data - skb->head;
-	/*
-	 *	Allocate the copy buffer
-	 */
-	struct sk_buff *n;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
-#else
-	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
-#endif
+	int headerlen = skb_headroom(skb);
+	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
+	struct sk_buff *n = alloc_skb(size, gfp_mask);
+
 	if (!n)
 		return NULL;
 
@@ -726,20 +720,14 @@ EXPORT_SYMBOL(skb_copy);
 
 struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 {
-	/*
-	 *	Allocate the copy buffer
-	 */
-	struct sk_buff *n;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end, gfp_mask);
-#else
-	n = alloc_skb(skb->end - skb->head, gfp_mask);
-#endif
+	unsigned int size = skb_end_pointer(skb) - skb->head;
+	struct sk_buff *n = alloc_skb(size, gfp_mask);
+
 	if (!n)
 		goto out;
 
 	/* Set the data pointer */
-	skb_reserve(n, skb->data - skb->head);
+	skb_reserve(n, skb_headroom(skb));
 	/* Set the tail pointer and length */
 	skb_put(n, skb_headlen(skb));
 	/* Copy the bytes */
@@ -791,11 +779,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 {
 	int i;
 	u8 *data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	int size = nhead + skb->end + ntail;
-#else
-	int size = nhead + (skb->end - skb->head) + ntail;
-#endif
+	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
 	long off;
 
 	BUG_ON(nhead < 0);
@@ -810,13 +794,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		goto nodata;
 
 	/* Copy only real data... and, alas, header. This should be
-	 * optimized for the cases when header is void. */
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	memcpy(data + nhead, skb->head, skb->tail);
-#else
-	memcpy(data + nhead, skb->head, skb->tail - skb->head);
-#endif
-	memcpy(data + size, skb_end_pointer(skb),
+	 * optimized for the cases when header is void.
+	 */
+	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
+
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb),
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
--
cgit v1.1
From fa50d6457691d5c2d8a3430abf950435ef129cf1 Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Tue, 31 Aug 2010 12:14:13 +0000
Subject: net: make rx_queue sysfs_ops const

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/core/net-sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7d74854..76485a3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -515,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	return attribute->store(queue, attribute, buf, count);
 }
 
-static struct sysfs_ops rx_queue_sysfs_ops = {
+static const struct sysfs_ops rx_queue_sysfs_ops = {
 	.show = rx_queue_attr_show,
 	.store = rx_queue_attr_store,
 };
--
cgit v1.1

From c07b68e841bd737e2ebeb57268d251c89ccc5010 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 2 Sep 2010 03:53:46 +0000
Subject: net: dev_add_pack() & __dev_remove_pack() changes

Add a small helper ptype_head() to get the head to manipulate.

dev_add_pack() & __dev_remove_pack() can use a spinlock without
blocking BH, since softirq use RCU, and these functions are run from
process context only.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d8c43e7..efd318d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,6 +371,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  *							--ANK (980803)
  */
 
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+	if (pt->type == htons(ETH_P_ALL))
+		return &ptype_all;
+	else
+		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
 /**
  *	dev_add_pack - add packet handler
  *	@pt: packet type declaration
@@ -386,16 +394,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 void dev_add_pack(struct packet_type *pt)
 {
-	int hash;
+	struct list_head *head = ptype_head(pt);
 
-	spin_lock_bh(&ptype_lock);
-	if (pt->type == htons(ETH_P_ALL))
-		list_add_rcu(&pt->list, &ptype_all);
-	else {
-		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
-		list_add_rcu(&pt->list, &ptype_base[hash]);
-	}
-	spin_unlock_bh(&ptype_lock);
+	spin_lock(&ptype_lock);
+	list_add_rcu(&pt->list, head);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
 
@@ -414,15 +417,10 @@ EXPORT_SYMBOL(dev_add_pack);
  */
 void __dev_remove_pack(struct packet_type *pt)
 {
-	struct list_head *head;
+	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
 
-	spin_lock_bh(&ptype_lock);
-
-	if (pt->type == htons(ETH_P_ALL))
-		head = &ptype_all;
-	else
-		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
 		if (pt == pt1) {
@@ -433,7 +431,7 @@ void __dev_remove_pack(struct packet_type *pt)
 
 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 out:
-	spin_unlock_bh(&ptype_lock);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(__dev_remove_pack);
 
--
cgit v1.1
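ptype_head() is the usual "compute the bucket once" refactor: add and remove now agree on where a packet type lives, and the ETH_P_ALL wildcard list falls out of the same helper. A generic sketch of the shape, with hypothetical types and sizes (not the kernel's):

	#include <stdint.h>
	#include <stddef.h>

	#define HASH_BITS 4
	#define HASH_MASK ((1u << HASH_BITS) - 1)

	struct node { struct node *next; uint16_t type; };

	static struct node *all_types;			/* like ptype_all */
	static struct node *type_hash[1 << HASH_BITS];	/* like ptype_base[] */

	/* One place that picks the list head; callers that add and callers
	 * that remove can no longer disagree about the bucket. */
	static struct node **bucket_for(uint16_t type, uint16_t wildcard)
	{
		if (type == wildcard)
			return &all_types;
		return &type_hash[type & HASH_MASK];
	}

The _bh-to-plain lock change rides on the same observation the commit states: readers use RCU and writers run only in process context, so softirqs never contend on ptype_lock.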
From 52ee7a04a0f88815a71acdc604a854fb30dcbe45 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 3 Sep 2010 06:27:08 +0000
Subject: net: remove two kmemcheck annotations

__alloc_skb() uses a memset() to clear all the beginning of skb,
including bitfields contained in 'flags1' & 'flags2'.

We no longer need to use kmemcheck_annotate_bitfield() on these fields.

However, we still need it for the clone part, which is not cleared.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 231dff0..c030cf8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -202,8 +202,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
 	skb->end = skb->tail + size;
-	kmemcheck_annotate_bitfield(skb, flags1);
-	kmemcheck_annotate_bitfield(skb, flags2);
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 	skb->mac_header = ~0U;
 #endif
--
cgit v1.1

From 1fd63041c49c5c6ed1fe58b7bccc2de462d51e2b Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 2 Sep 2010 23:09:32 +0000
Subject: net: pskb_expand_head() optimization

pskb_expand_head() blindly takes references on fragments before calling
skb_release_data(), potentially releasing these references.

We can add a fast path, avoiding these atomic operations, if we own the
last reference on skb->head.

Based on a previous patch from David

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c030cf8..2d1bc76 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -779,6 +779,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	u8 *data;
 	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
 	long off;
+	bool fastpath;
 
 	BUG_ON(nhead < 0);
 
@@ -800,14 +801,28 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	       skb_shinfo(skb),
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		get_page(skb_shinfo(skb)->frags[i].page);
+	/* Check if we can avoid taking references on fragments if we own
+	 * the last reference on skb->head. (see skb_release_data())
+	 */
+	if (!skb->cloned)
+		fastpath = true;
+	else {
+		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
 
-	if (skb_has_frag_list(skb))
-		skb_clone_fraglist(skb);
+		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
+	}
 
-	skb_release_data(skb);
+	if (fastpath) {
+		kfree(skb->head);
+	} else {
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			get_page(skb_shinfo(skb)->frags[i].page);
 
+		if (skb_has_frag_list(skb))
+			skb_clone_fraglist(skb);
+
+		skb_release_data(skb);
+	}
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
--
cgit v1.1
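The fastpath test leans on how skb->dataref packs two counters into one atomic: the low SKB_DATAREF_SHIFT bits count all references to the data, the high bits count payload-only references. A small arithmetic sketch; the shift value matches skbuff.h, while the scenario itself is made up for illustration:

	#include <stdio.h>

	#define SKB_DATAREF_SHIFT 16
	#define SKB_DATAREF_MASK  ((1 << SKB_DATAREF_SHIFT) - 1)

	int main(void)
	{
		/* two total users of the data, one of them payload-only */
		unsigned int dataref = (1 << SKB_DATAREF_SHIFT) + 2;

		printf("total users:        %u\n", dataref & SKB_DATAREF_MASK);
		printf("payload-only users: %u\n", dataref >> SKB_DATAREF_SHIFT);

		/* pskb_expand_head()'s "we own the head" check: a cloned skb
		 * with nohdr set reads as one payload-only ref plus one total
		 * ref when nobody else holds the data. */
		printf("fastpath if dataref == %u\n",
		       (1 << SKB_DATAREF_SHIFT) + 1);
		return 0;
	}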
From db40980fcdb560d7992b0511df16cdd3f7e381f3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 6 Sep 2010 11:13:50 +0000
Subject: net: poll() optimizations

No need to test sk->sk_shutdown & RCV_SHUTDOWN twice.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/datagram.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a..4df1b7a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -746,13 +746,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
--
cgit v1.1

From 6febfca98f25c7ee5c3ff7fc85e048bf82230ad5 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Fri, 3 Sep 2010 23:12:37 +0000
Subject: net: rps: add the shortcut for one rps_cpus

When there is only one CPU in rps_cpus, skb_get_rxhash() can be
eliminated.

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index efd318d..cdbbea3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2343,7 +2343,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
 	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	struct rps_map *map = NULL;
 	struct rps_dev_flow_table *flow_table;
 	struct rps_sock_flow_table *sock_flow_table;
 	int cpu = -1;
@@ -2361,8 +2361,17 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	} else
 		rxqueue = dev->_rx;
 
-	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
+	if (rxqueue->rps_map) {
+		map = rcu_dereference(rxqueue->rps_map);
+		if (map && map->len == 1) {
+			tcpu = map->cpus[0];
+			if (cpu_online(tcpu))
+				cpu = tcpu;
+			goto done;
+		}
+	} else if (!rxqueue->rps_flow_table) {
 		goto done;
+	}
 
 	skb_reset_network_header(skb);
 	if (!skb_get_rxhash(skb))
@@ -2407,7 +2416,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
-	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
--
cgit v1.1
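The context line above ends with the RPS map lookup, map->cpus[((u64) skb->rxhash * map->len) >> 32]: multiplying a 32-bit hash by len and keeping the top 32 bits maps the hash uniformly onto [0, len) with no divide or modulo. A standalone illustration:

	#include <stdint.h>
	#include <stdio.h>

	/* Map a 32-bit hash onto [0, len) without a modulo. */
	static uint32_t scale(uint32_t hash, uint32_t len)
	{
		return (uint32_t)(((uint64_t)hash * len) >> 32);
	}

	int main(void)
	{
		uint32_t len = 6;	/* e.g. six CPUs in the map */

		printf("%u %u %u\n",
		       scale(0x00000000u, len),	/* 0 */
		       scale(0x80000000u, len),	/* 3: the midpoint */
		       scale(0xffffffffu, len));	/* 5: the top bucket */
		return 0;
	}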
From a700d8be733bd593ea4797dfde17aed4f35213c0 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Wed, 8 Sep 2010 03:48:47 +0000
Subject: net/core: remove address space warnings on verify_iovec()

move_addr_to_kernel() and copy_from_user() require their arguments as
__user pointers but were missing the proper markup. Add it.

This removes the following warnings from sparse:

 net/core/iovec.c:44:52: warning: incorrect type in argument 1 (different address spaces)
 net/core/iovec.c:44:52:    expected void [noderef] *uaddr
 net/core/iovec.c:44:52:    got void *msg_name
 net/core/iovec.c:55:34: warning: incorrect type in argument 2 (different address spaces)
 net/core/iovec.c:55:34:    expected void const [noderef] *from
 net/core/iovec.c:55:34:    got struct iovec *msg_iov

Signed-off-by: Namhyung Kim
Signed-off-by: David S. Miller
---
 net/core/iovec.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/iovec.c b/net/core/iovec.c
index 1cd98df..f4657c2 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -41,7 +41,9 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
 
 	if (m->msg_namelen) {
 		if (mode == VERIFY_READ) {
-			err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
+			void __user *namep;
+			namep = (void __user __force *) m->msg_name;
+			err = move_addr_to_kernel(namep, m->msg_namelen,
 						  address);
 			if (err < 0)
 				return err;
@@ -52,7 +54,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
 	}
 
 	size = m->msg_iovlen * sizeof(struct iovec);
-	if (copy_from_user(iov, m->msg_iov, size))
+	if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
 		return -EFAULT;
 
 	m->msg_iov = iov;
--
cgit v1.1

From f39234d60617d37818b30991e6794643ce220296 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Wed, 8 Sep 2010 03:48:48 +0000
Subject: net/core: add lock context change annotations in net/core/sock.c

__lock_sock() and __release_sock() release and regrab the socket lock
but were missing proper annotations. Add them.

This removes the following warning from sparse. (Currently
__lock_sock() does not emit any warning about it, but I think it is
better to annotate it as well.)

 net/core/sock.c:1580:17: warning: context imbalance in '__release_sock' - unexpected unlock

Signed-off-by: Namhyung Kim
Signed-off-by: David S. Miller
---
 net/core/sock.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index b05b9b6..f3a06c4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1557,6 +1557,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
 static void __lock_sock(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+	__acquires(&sk->sk_lock.slock)
 {
 	DEFINE_WAIT(wait);
 
@@ -1573,6 +1575,8 @@ static void __lock_sock(struct sock *sk)
 }
 
 static void __release_sock(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+	__acquires(&sk->sk_lock.slock)
 {
 	struct sk_buff *skb = sk->sk_backlog.head;
 
--
cgit v1.1

From 9ca7f8762299bb391c11a81c844224216e925b5c Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Wed, 8 Sep 2010 09:16:28 +0000
Subject: pkt_sched: remove unnecessary bh_disable

Now that est_tree_lock is acquired with BH protection, the other call
is unnecessary.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/core/gen_estimator.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 6743146..7c23733 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -274,9 +274,9 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 	while ((e = gen_find_node(bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
 
-		write_lock_bh(&est_lock);
+		write_lock(&est_lock);
 		e->bstats = NULL;
-		write_unlock_bh(&est_lock);
+		write_unlock(&est_lock);
 
 		list_del_rcu(&e->list);
 		call_rcu(&e->e_rcu, __gen_kill_estimator);
--
cgit v1.1
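The __releases()/__acquires() pair added to __lock_sock()/__release_sock() above costs nothing at compile time; it only feeds sparse's lock-context tracking. A minimal sketch of the same annotation pattern, with the macro expansions shown inline for clarity (in the kernel they come from compiler.h; the mutex type and do_blocking_work() below are stand-ins):

	#ifdef __CHECKER__
	# define __releases(x)	__attribute__((context(x, 1, 0)))
	# define __acquires(x)	__attribute__((context(x, 0, 1)))
	#else
	# define __releases(x)
	# define __acquires(x)
	#endif

	/* Entered with the lock held, returns with it held, but drops it
	 * in the middle, the same shape as __lock_sock()/__release_sock().
	 * Without both annotations sparse reports a context imbalance. */
	static void do_work_unlocked(struct mutex *m)
		__releases(m)
		__acquires(m)
	{
		mutex_unlock(m);
		do_blocking_work();	/* assumed helper */
		mutex_lock(m);
	}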
From 83b6b1f5d13414d0cb5c4f0a567a6aec0af073bd Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 10 Sep 2010 07:00:25 +0000
Subject: flow: better memory management

Allocate hash tables for every online cpu, not every possible one.

NUMA aware allocations.

Don't use a full page on arches where PAGE_SIZE > 1024 * sizeof(void *).

misc: __percpu, __read_mostly, __cpuinit annotations

flow_compare_t is just an "unsigned long"

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/flow.c | 78 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 36 deletions(-)

(limited to 'net/core')

diff --git a/net/core/flow.c b/net/core/flow.c
index f67dcbf..b143b86 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -53,8 +53,7 @@ struct flow_flush_info {
 
 struct flow_cache {
 	u32				hash_shift;
-	unsigned long			order;
-	struct flow_cache_percpu	*percpu;
+	struct flow_cache_percpu __percpu *percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -64,7 +63,7 @@ struct flow_cache {
 atomic_t flow_cache_genid = ATOMIC_INIT(0);
 EXPORT_SYMBOL(flow_cache_genid);
 static struct flow_cache flow_cache_global;
-static struct kmem_cache *flow_cachep;
+static struct kmem_cache *flow_cachep __read_mostly;
 
 static DEFINE_SPINLOCK(flow_cache_gc_lock);
 static LIST_HEAD(flow_cache_gc_list);
@@ -181,11 +180,7 @@ static u32 flow_hash_code(struct flow_cache *fc,
 		& (flow_cache_hash_size(fc) - 1));
 }
 
-#if (BITS_PER_LONG == 64)
-typedef u64 flow_compare_t;
-#else
-typedef u32 flow_compare_t;
-#endif
+typedef unsigned long flow_compare_t;
 
 /* I hear what you're saying, use memcmp. But memcmp cannot make
  * important assumptions that we can here, such as alignment and
@@ -357,62 +352,73 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
-					  struct flow_cache_percpu *fcp)
+static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
 {
-	fcp->hash_table = (struct hlist_head *)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
-	if (!fcp->hash_table)
-		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+	size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
 
-	fcp->hash_rnd_recalc = 1;
-	fcp->hash_count = 0;
-	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+	if (!fcp->hash_table) {
+		fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+		if (!fcp->hash_table) {
+			pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
+			return -ENOMEM;
+		}
+		fcp->hash_rnd_recalc = 1;
+		fcp->hash_count = 0;
+		tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+	}
+	return 0;
 }
 
-static int flow_cache_cpu(struct notifier_block *nfb,
+static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
 	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
-	int cpu = (unsigned long) hcpu;
+	int res, cpu = (unsigned long) hcpu;
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
 
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		res = flow_cache_cpu_prepare(fc, cpu);
+		if (res)
+			return notifier_from_errno(res);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		__flow_cache_shrink(fc, fcp, 0);
+		break;
+	}
 	return NOTIFY_OK;
 }
 
-static int flow_cache_init(struct flow_cache *fc)
+static int __init flow_cache_init(struct flow_cache *fc)
 {
-	unsigned long order;
 	int i;
 
 	fc->hash_shift = 10;
 	fc->low_watermark = 2 * flow_cache_hash_size(fc);
 	fc->high_watermark = 4 * flow_cache_hash_size(fc);
 
-	for (order = 0;
-	     (PAGE_SIZE << order) <
-	     (sizeof(struct hlist_head) * flow_cache_hash_size(fc));
-	     order++)
-		/* NOTHING */;
-	fc->order = order;
 	fc->percpu = alloc_percpu(struct flow_cache_percpu);
+	if (!fc->percpu)
+		return -ENOMEM;
 
-	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
-		    (unsigned long) fc);
-	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&fc->rnd_timer);
-
-	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
-
+	for_each_online_cpu(i) {
+		if (flow_cache_cpu_prepare(fc, i))
+			return -ENOMEM;
+	}
 	fc->hotcpu_notifier = (struct notifier_block){
 		.notifier_call = flow_cache_cpu,
 	};
 	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+		    (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
+
 	return 0;
 }
--
cgit v1.1

From e0de7c93b950b9e784894efc4b529c6958cb747a Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Tue, 14 Sep 2010 09:13:08 +0000
Subject: ethtool: Remove unimplemented flow specification types

struct ethtool_rawip4_spec and struct ethtool_ether_spec are neither
commented nor used by any driver, so remove them.  Adjust padding in
the user-visible unions that included these structures.

Fix references to struct ethtool_rawip4_spec in
ethtool_get_rx_ntuple(), which should use struct ethtool_usrip4_spec.

struct ethtool_usrip4_spec cannot hold IPv6 host addresses and there
is no separate structure that can, so remove ETH_RX_NFC_IP6 and the
reference to it in niu.

Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 970eb98..fcd62757 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -673,19 +673,19 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
 			break;
 		case IP_USER_FLOW:
 			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.raw_ip4_spec.ip4src);
+				fsc->fs.h_u.usr_ip4_spec.ip4src);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.raw_ip4_spec.ip4src);
+				fsc->fs.m_u.usr_ip4_spec.ip4src);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.raw_ip4_spec.ip4dst);
+				fsc->fs.h_u.usr_ip4_spec.ip4dst);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.raw_ip4_spec.ip4dst);
+				fsc->fs.m_u.usr_ip4_spec.ip4dst);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			break;
--
cgit v1.1
From 95ae6b228f814fc0528d0506ee9f18ac333d6851 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 15 Sep 2010 04:04:31 +0000
Subject: ipv4: ip_ptr cleanups

dev->ip_ptr is protected by rtnl and rcu.

Yet some places don't use the appropriate primitives and/or locking
rules.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index fc2dc93..5bdce97 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5286,7 +5286,7 @@ void netdev_run_todo(void)
 
 		/* paranoia */
 		BUG_ON(atomic_read(&dev->refcnt));
-		WARN_ON(dev->ip_ptr);
+		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
 
--
cgit v1.1

From 16c3ea785fe4a383c6675dfe7316f3c815755bdd Mon Sep 17 00:00:00 2001
From: Brandon Philips
Date: Wed, 15 Sep 2010 09:24:24 +0000
Subject: net: enable GRO by default for vlan devices

Currently vlan devices don't have GRO by default as none of the
Ethernet drivers add NETIF_F_GRO to their vlan_features.

As GRO is a software feature, add GRO to dev->vlan_features in
register_netdevice() and let vlan_dev_init() take care that it gets
enabled only when dev->features has NETIF_F_GRO too.

Signed-off-by: Brandon Philips
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5bdce97..09b3742 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5057,6 +5057,11 @@ int register_netdevice(struct net_device *dev)
 	if (dev->features & NETIF_F_SG)
 		dev->features |= NETIF_F_GSO;
 
+	/* Enable GRO for vlans by default if dev->features has GRO also.
+	 * vlan_dev_init() will do the dev->features check.
+	 */
+	dev->vlan_features |= NETIF_F_GRO;
+
 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 	ret = notifier_to_errno(ret);
 	if (ret)
--
cgit v1.1

From caeda9b926c608702c99f3432aae2c24298c3c1d Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Thu, 16 Sep 2010 21:39:16 -0700
Subject: net: include inetdevice.h for rcu_dereference_raw api change

rcu_dereference_raw() now needs to know the type of its argument.

Signed-off-by: Stephen Rothwell
Signed-off-by: David S. Miller
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 09b3742..c09ff09 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -129,6 +129,7 @@
 #include
 #include
 #include
+#include <linux/inetdevice.h>
 
 #include "net-sysfs.h"
--
cgit v1.1
From 67c9660831f6b6b76866a0838466c83765ffbbd3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 17 Sep 2010 11:56:18 -0700
Subject: ethtool: change ethtool_set_gro() to use ethtool_op_get_rx_csum

To be able to switch on GRO on a device, ethtool_set_gro() checks that
the device provides a get_rx_csum() method.

Some devices don't provide this method, while they do support RX
checksumming.

This patch allows bonding to support GRO:

  ethtool -K bond0 gro on

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index fcd62757..248c25c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1181,8 +1181,11 @@ static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
 		return -EFAULT;
 
 	if (edata.data) {
-		if (!dev->ethtool_ops->get_rx_csum ||
-		    !dev->ethtool_ops->get_rx_csum(dev))
+		u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
+				dev->ethtool_ops->get_rx_csum(dev) :
+				ethtool_op_get_rx_csum(dev);
+
+		if (!rxcsum)
 			return -EINVAL;
 		dev->features |= NETIF_F_GRO;
 	} else
--
cgit v1.1

From 3b27e105550f7c4a79ecb6d6a9c49c651c59ae9b Mon Sep 17 00:00:00 2001
From: David Lamparter
Date: Fri, 17 Sep 2010 03:22:19 +0000
Subject: netns: keep vlan slaves on master netns move

Previously, if a vlan master device was moved from one network
namespace to another, all 802.1q and macvlan slaves were deleted.

We can use dev->reg_state to figure out whether
dev_change_net_namespace is happening, since that won't set
dev->reg_state to NETREG_UNREGISTERING.

So, this changes 8021q and macvlan to ignore NETDEV_UNREGISTER when
reg_state is not NETREG_UNREGISTERING.

Signed-off-by: David Lamparter
Reviewed-by: "Eric W. Biederman"
Acked-by: Daniel Lezcano
Acked-by: Patrick McHardy
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c09ff09..2c7934f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5686,6 +5686,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Notify protocols, that we are about to destroy
 	   this device. They should clean all the things.
+
+	   Note that dev->reg_state stays at NETREG_REGISTERED.
+	   This is wanted because this way 8021q and macvlan know
+	   the device is just moving and can keep their slaves up.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
--
cgit v1.1
From be2902daee80b655cebd482b5ee91ffc29408121 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Thu, 16 Sep 2010 11:28:07 +0000
Subject: ethtool, ixgbe: Move RX n-tuple mask fixup to ethtool

The ethtool utility does not set masks for flow parameters that are
not specified, so if both value and mask are 0 then this must be
treated as equivalent to a mask with all bits set.  Currently that is
done in the only driver that implements RX n-tuple filtering, ixgbe.
Move it to the ethtool core.

Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 248c25c..91ffce2 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -485,6 +485,38 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 	list->count++;
 }
 
+/*
+ * ethtool does not (or did not) set masks for flow parameters that are
+ * not specified, so if both value and mask are 0 then this must be
+ * treated as equivalent to a mask with all bits set.  Implement that
+ * here rather than in drivers.
+ */
+static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
+{
+	struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
+	struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
+
+	if (fs->flow_type != TCP_V4_FLOW &&
+	    fs->flow_type != UDP_V4_FLOW &&
+	    fs->flow_type != SCTP_V4_FLOW)
+		return;
+
+	if (!(entry->ip4src | mask->ip4src))
+		mask->ip4src = htonl(0xffffffff);
+	if (!(entry->ip4dst | mask->ip4dst))
+		mask->ip4dst = htonl(0xffffffff);
+	if (!(entry->psrc | mask->psrc))
+		mask->psrc = htons(0xffff);
+	if (!(entry->pdst | mask->pdst))
+		mask->pdst = htons(0xffff);
+	if (!(entry->tos | mask->tos))
+		mask->tos = 0xff;
+	if (!(fs->vlan_tag | fs->vlan_tag_mask))
+		fs->vlan_tag_mask = 0xffff;
+	if (!(fs->data | fs->data_mask))
+		fs->data_mask = 0xffffffffffffffffULL;
+}
+
 static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 						    void __user *useraddr)
 {
@@ -499,6 +531,8 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
+	rx_ntuple_fix_masks(&cmd.fs);
+
 	/*
 	 * Cache filter in dev struct for GET operation only if
 	 * the underlying driver doesn't have its own GET operation, and
--
cgit v1.1

From a77f5db361ed9953b5b749353ea2c7fed2bf8d93 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Mon, 20 Sep 2010 08:42:17 +0000
Subject: ethtool: Allocate register dump buffer with vmalloc()

Some NICs have huge register files which exceed the maximum heap
allocation size.

Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 91ffce2..dae2fd0 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -815,7 +815,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	if (regs.len > reglen)
 		regs.len = reglen;
 
-	regbuf = kmalloc(reglen, GFP_USER);
+	regbuf = vmalloc(reglen);
 	if (!regbuf)
 		return -ENOMEM;
 
@@ -830,7 +830,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	ret = 0;
 
  out:
-	kfree(regbuf);
+	vfree(regbuf);
 	return ret;
 }
--
cgit v1.1

From 73da16c28ed0724755afdb95c7dd6b166381be10 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 21 Sep 2010 16:12:11 -0700
Subject: ethtool: Fix build due to lack of ethtool.h include.

net/core/ethtool.c: In function 'ethtool_get_regs':
net/core/ethtool.c:818:2: error: implicit declaration of function 'vmalloc'
net/core/ethtool.c:818:9: warning: assignment makes pointer from integer without a cast
net/core/ethtool.c:833:2: error: implicit declaration of function 'vfree'

Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index dae2fd0..7d7e572 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <linux/vmalloc.h>
 #include
 
 /*
--
cgit v1.1
Miller --- net/core/pktgen.c | 10 ++++------ net/core/utils.c | 13 +++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 386c228..2c0df0f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -729,16 +729,14 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, *num = 0; for (; i < maxlen; i++) { + int value; char c; *num <<= 4; if (get_user(c, &user_buffer[i])) return -EFAULT; - if ((c >= '0') && (c <= '9')) - *num |= c - '0'; - else if ((c >= 'a') && (c <= 'f')) - *num |= c - 'a' + 10; - else if ((c >= 'A') && (c <= 'F')) - *num |= c - 'A' + 10; + value = hex_to_bin(c); + if (value >= 0) + *num |= value; else break; } diff --git a/net/core/utils.c b/net/core/utils.c index f418544..ec6bb32 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -92,18 +92,19 @@ EXPORT_SYMBOL(in_aton); static inline int xdigit2bin(char c, int delim) { + int val; + if (c == delim || c == '\0') return IN6PTON_DELIM; if (c == ':') return IN6PTON_COLON_MASK; if (c == '.') return IN6PTON_DOT; - if (c >= '0' && c <= '9') - return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); - if (c >= 'a' && c <= 'f') - return (IN6PTON_XDIGIT | (c - 'a' + 10)); - if (c >= 'A' && c <= 'F') - return (IN6PTON_XDIGIT | (c - 'A' + 10)); + + val = hex_to_bin(c); + if (val >= 0) + return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); + if (delim == -1) return IN6PTON_DELIM; return IN6PTON_UNKNOWN; -- cgit v1.1 From a02cec2155fbea457eca8881870fd2de1a4c4c76 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 20:43:57 +0000 Subject: net: return operator cleanup Change "return (EXPR);" to "return EXPR;" return is not a function, parentheses are not required. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow.c | 4 ++-- net/core/neighbour.c | 6 +++--- net/core/utils.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/flow.c b/net/core/flow.c index b143b86..127c8a7 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -176,8 +176,8 @@ static u32 flow_hash_code(struct flow_cache *fc, { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) - & (flow_cache_hash_size(fc) - 1)); + return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1); } typedef unsigned long flow_compare_t; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a4e0a74..96b1a749a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return (base ? (net_random() % base) + (base >> 1) : 0); + return base ? (net_random() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -766,9 +766,9 @@ next_elt: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE ? + return (n->nud_state & NUD_PROBE) ? 
p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes; } static void neigh_invalidate(struct neighbour *neigh) diff --git a/net/core/utils.c b/net/core/utils.c index ec6bb32..5fea0ab 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -75,7 +75,7 @@ __be32 in_aton(const char *str) str++; } } - return(htonl(l)); + return htonl(l); } EXPORT_SYMBOL(in_aton); -- cgit v1.1 From c5256c51232d8312755e8de2b514c426b19b101a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 00:46:11 +0000 Subject: net: propagate NETIF_F_HIGHDMA to vlans Automatically allows vlans to get NETIF_F_HIGHDMA if the underlying device supports it. On 32bit arches (and more precisely if CONFIG_HIGHMEM is enabled), it can help to reduce cost of illegal_highdma() and __skb_linearize() calls. Tested on tg3, bnx2, bonding, this worked very well. This is a generalization of a patch provided by Yi Zou & Jeff Kirsher. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2c7934f..e0c0b86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5058,10 +5058,11 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; - /* Enable GRO for vlans by default if dev->features has GRO also. - * vlan_dev_init() will do the dev->features check. + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, + * vlan_dev_init() will do the dev->features check, so these features + * are enabled only if supported by underlying device. */ - dev->vlan_features |= NETIF_F_GRO; + dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); -- cgit v1.1 From 1b4bf461f05d56ced6d6b8f3b4831adc7076f565 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 17:26:35 +0000 Subject: rps: allocate rx queues in register_netdevice only Instead of having two places where we allocate dev->_rx, introduce netif_alloc_rx_queues() helper and call it only from register_netdevice(), not from alloc_netdev_mq() The goal is to let drivers change dev->num_rx_queues after allocating netdev and before registering it. This also removes a lot of ifdefs in net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 76 +++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 44 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e0c0b86..72e9983 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4964,6 +4964,34 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +static int netif_alloc_rx_queues(struct net_device *dev) +{ +#ifdef CONFIG_RPS + unsigned int i, count = dev->num_rx_queues; + + if (count) { + struct netdev_rx_queue *rx; + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", + count); + return -ENOMEM; + } + dev->_rx = rx; + atomic_set(&rx->count, count); + + /* + * Set a pointer to first element in the array which holds the + * reference count.
+ */ + for (i = 0; i < count; i++) + rx[i].first = rx; + } +#endif + return 0; +} + /** * register_netdevice - register a network device * @dev: device to register @@ -5001,24 +5029,10 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_RPS - if (!dev->num_rx_queues) { - /* - * Allocate a single RX queue if driver never called - * alloc_netdev_mq - */ - - dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!dev->_rx) { - ret = -ENOMEM; - goto out; - } + ret = netif_alloc_rx_queues(dev); + if (ret) + goto out; - dev->_rx->first = dev->_rx; - atomic_set(&dev->_rx->count, 1); - dev->num_rx_queues = 1; - } -#endif /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5415,10 +5429,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, struct net_device *dev; size_t alloc_size; struct net_device *p; -#ifdef CONFIG_RPS - struct netdev_rx_queue *rx; - int i; -#endif BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5444,29 +5454,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } -#ifdef CONFIG_RPS - rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "rx queues.\n"); - goto free_tx; - } - - atomic_set(&rx->count, queue_count); - - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ - for (i = 0; i < queue_count; i++) - rx[i].first = rx; -#endif dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) - goto free_rx; + goto free_tx; dev_mc_init(dev); dev_uc_init(dev); @@ -5478,7 +5471,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->real_num_tx_queues = queue_count; #ifdef CONFIG_RPS - dev->_rx = rx; dev->num_rx_queues = queue_count; #endif @@ -5496,11 +5488,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; -free_rx: -#ifdef CONFIG_RPS - kfree(rx); free_tx: -#endif kfree(tx); free_p: kfree(p); -- cgit v1.1 From 7fa7cb7109d07c29ab28bb877bc7049a0150dbe5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 04:18:27 +0000 Subject: fib: use atomic_inc_not_zero() in fib_rules_lookup It seems we don't use an appropriate refcount increment in an rcu_read_lock() protected section. fib_rule_get() might increment a null refcount and bad things could happen. While fib_nl_delrule() respects an rcu grace period before calling fib_rule_put(), fib_rules_cleanup_ops() calls fib_rule_put() without a grace period. Note: after this patch, we might avoid the synchronize_rcu() call done in fib_nl_delrule() Signed-off-by: Eric Dumazet Signed-off-by: David S.
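The bug class fixed here deserves one spelled-out example. Under rcu_read_lock() a reader can still observe an object whose last reference has already been dropped, so taking a new reference must be conditional on the count still being non-zero. Below is a minimal userspace sketch of that rule using C11 atomics; inc_not_zero() is a stand-in for the kernel's atomic_inc_not_zero(), not the kernel code itself.

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        /* Take a reference only while the count is still non-zero.
         * A plain increment could resurrect an object whose final
         * reference is already gone. */
        static bool inc_not_zero(atomic_int *refcnt)
        {
                int old = atomic_load(refcnt);

                while (old != 0) {
                        /* On failure the CAS reloads 'old'; retry. */
                        if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
                                return true;
                }
                return false;
        }

        int main(void)
        {
                atomic_int live = 1;
                atomic_int dying = 0;

                printf("live object:  %s\n", inc_not_zero(&live) ? "ref taken" : "skipped");
                printf("dying object: %s\n", inc_not_zero(&dying) ? "ref taken" : "skipped");
                return 0;
        }

The compare-and-swap loop is what makes the check and the increment one atomic step; a separate load followed by an unconditional increment would race with the final put, which is exactly what fib_rule_get() was doing.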
Miller --- net/core/fib_rules.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 42e84e0..d078728 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -225,9 +225,11 @@ jumped: err = ops->action(rule, fl, flags, arg); if (err != -EAGAIN) { - fib_rule_get(rule); - arg->rule = rule; - goto out; + if (likely(atomic_inc_not_zero(&rule->refcnt))) { + arg->rule = rule; + goto out; + } + break; } } -- cgit v1.1 From f91ff5b9ff529be8aac2039af63b2c8ea6cd6ebe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 06:07:30 +0000 Subject: net: sk_{detach|attach}_filter() rcu fixes sk_attach_filter() and sk_detach_filter() are run with socket locked. Use the appropriate rcu_dereference_protected() instead of blocking BH, and rcu_dereference_bh(). There is no point adding BH prevention and memory barrier. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 52b051f..7adf503 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -638,10 +638,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return err; } - rcu_read_lock_bh(); - old_fp = rcu_dereference_bh(sk->sk_filter); + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); - rcu_read_unlock_bh(); if (old_fp) sk_filter_delayed_uncharge(sk, old_fp); @@ -654,14 +653,13 @@ int sk_detach_filter(struct sock *sk) int ret = -ENOENT; struct sk_filter *filter; - rcu_read_lock_bh(); - filter = rcu_dereference_bh(sk->sk_filter); + filter = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); sk_filter_delayed_uncharge(sk, filter); ret = 0; } - rcu_read_unlock_bh(); return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); -- cgit v1.1 From 62fe0b40abb3484413800edaef9b087a20059acf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 27 Sep 2010 08:24:33 +0000 Subject: net: Allow changing number of RX queues after device allocation For RPS, we create a kobject for each RX queue based on the number of queues passed to alloc_netdev_mq(). However, drivers generally do not determine the numbers of hardware queues to use until much later, so this usually represents the maximum number the driver may use and not the actual number in use. For TX queues, drivers can update the actual number using netif_set_real_num_tx_queues(). Add a corresponding function for RX queues, netif_set_real_num_rx_queues(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++---- net/core/net-sysfs.c | 32 ++++++++++++++++++-------------- net/core/net-sysfs.h | 4 ++++ 3 files changed, 63 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 42b200f..48ad47f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1567,6 +1567,41 @@ void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); +#ifdef CONFIG_RPS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. 
Returns 0 on success, or a + * negative error code. If called before registration, it also + * sets the maximum number of queues, and always succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + if (rxq > dev->num_rx_queues) + return -EINVAL; + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } else { + dev->num_rx_queues = rxq; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif + static inline void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; @@ -2352,10 +2387,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue = dev->_rx + index; @@ -5472,6 +5508,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, #ifdef CONFIG_RPS dev->num_rx_queues = queue_count; + dev->real_num_rx_queues = queue_count; #endif dev->gso_max_size = GSO_MAX_SIZE; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 76485a3..fa81fd0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -742,34 +742,38 @@ static int rx_queue_add_kobject(struct net_device *net, int index) return error; } -static int rx_queue_register_kobjects(struct net_device *net) +int +net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { int i; int error = 0; - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) - return -ENOMEM; - for (i = 0; i < net->num_rx_queues; i++) { + for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(net, i); - if (error) + if (error) { + new_num = old_num; break; + } } - if (error) - while (--i >= 0) - kobject_put(&net->_rx[i].kobj); + while (--i >= new_num) + kobject_put(&net->_rx[i].kobj); return error; } -static void rx_queue_remove_kobjects(struct net_device *net) +static int rx_queue_register_kobjects(struct net_device *net) { - int i; + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; + return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues); +} - for (i = 0; i < net->num_rx_queues; i++) - kobject_put(&net->_rx[i].kobj); +static void rx_queue_remove_kobjects(struct net_device *net) +{ + net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0); kset_unregister(net->queues_kset); } #endif /* CONFIG_RPS */ diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 805555e..778e157 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -4,4 +4,8 @@ int netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); +#ifdef CONFIG_RPS +int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +#endif + #endif -- cgit v1.1 From 4465b469008bc03b98a1b8df4e9ae501b6c69d4b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 23 May 2010 19:54:12 +0000 Subject: ipv4: 
Allow configuring subnets as local addresses This patch allows a host to be configured to respond to any address in a specified range as if it were local, without actually needing to configure the address on an interface. This is done through routing table configuration. For instance, to configure a host to respond to any address in 10.1/16 received on eth0 as a local address we can do: ip rule add from all iif eth0 lookup 200 ip route add local 10.1/16 dev lo proto kernel scope host src 127.0.0.1 table 200 This host is now reachable by any 10.1/16 address (route lookup on input for packets received on eth0 can find the route). On output, the rule will not be matched so that this host can still send packets to 10.1/16 (not sent on loopback). Presumably, external routing can be configured to make sense out of this. To make this work, we needed to modify the logic in finding the interface which is assigned a given source address for output (dev_ip_find). We perform a normal fib_lookup instead of just a lookup on the local table, and in the lookup we ignore the input interface for matching. This patch is useful to implement IP-anycast for subnets of virtual addresses. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d078728..332c2e3 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -182,7 +182,8 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->iifindex && (rule->iifindex != fl->iif)) + if (rule->iifindex && (rule->iifindex != fl->iif) && + !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF)) goto out; if (rule->oifindex && (rule->oifindex != fl->oif)) -- cgit v1.1 From 745e20f1b626b1be4b100af5d4bf7b3439392f8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2010 13:23:09 -0700 Subject: net: add a recursion limit in xmit path As tunnel devices are going to be lockless, we need to make sure a misconfigured machine wont enter an infinite loop. Add a percpu variable, and limit to three the number of stacked xmits. Reported-by: Jesse Gross Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 48ad47f..50dacca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 3 + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2242,10 +2245,15 @@ int dev_queue_xmit(struct sk_buff *skb) if (txq->xmit_lock_owner != cpu) { + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { + __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -2257,7 +2265,9 @@ int dev_queue_xmit(struct sk_buff *skb) "queue packet!\n", dev->name); } else { /* Recursion is detected! 
It is possible, - * unfortunately */ + * unfortunately + */ +recursion_alert: if (net_ratelimit()) printk(KERN_CRIT "Dead loop on virtual device " "%s, fix it urgently!\n", dev->name); -- cgit v1.1 From bfa5ae63b823f4ffd3483a05f60a93a4a7b7d680 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 05:58:37 +0000 Subject: net: rename netdev rx_queue to ingress_queue There is some confusion with rx_queue name after RPS, and net drivers private rx_queue fields. I suggest to rename "struct net_device"->rx_queue to ingress_queue. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 50dacca..a313bab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2720,7 +2720,7 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->rx_queue; + rxq = &dev->ingress_queue; q = rxq->qdisc; if (q != &noop_qdisc) { @@ -2737,7 +2737,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->rx_queue.qdisc == &noop_qdisc) + if (skb->dev->ingress_queue.qdisc == &noop_qdisc) goto out; if (*pt_prev) { @@ -4940,7 +4940,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, static void netdev_init_queue_locks(struct net_device *dev) { netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); + __netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -5452,7 +5452,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->rx_queue, NULL); + netdev_init_one_queue(dev, &dev->ingress_queue, NULL); netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); } -- cgit v1.1 From c7d4426a98a5f6654cd0b4b33d9dab2e77192c18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Oct 2010 22:17:54 -0700 Subject: net: introduce DST_NOCACHE flag While doing stress tests with IP route cache disabled, and multi queue devices, I noticed a very high contention on one rwlock used in neighbour code. When many cpus are trying to send frames (possibly using a high performance multiqueue device) to the same neighbour, they fight for the neigh->lock rwlock in order to call neigh_hh_init(), and fight on hh->hh_refcnt (a pair of atomic_inc/atomic_dec_and_test()) But we dont need to call neigh_hh_init() for dst that are used only once. It costs four atomic operations at least, on two contended cache lines, plus the high contention on neigh->lock rwlock. Introduce a new dst flag, DST_NOCACHE, that is set when dst was not inserted in route cache. With the stress test bench, sending 160000000 frames on one neighbour, results are : Before patch: real 2m28.406s user 0m11.781s sys 36m17.964s After patch: real 1m26.532s user 0m12.185s sys 20m3.903s Signed-off-by: Eric Dumazet Signed-off-by: David S. 
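Returning to the xmit recursion limit a few patches up: the per-cpu counter can be approximated in ordinary C with a thread-local variable, a thread being the closest userspace analogue of a non-preemptible per-cpu context. This is a hedged sketch of the guard pattern, not the kernel implementation.

        #include <stdio.h>

        #define RECURSION_LIMIT 3

        /* Thread-local stand-in for the per-cpu xmit_recursion counter. */
        static _Thread_local int xmit_recursion;

        /* Simulated transmit on a misconfigured device stack where every
         * "device" hands the packet straight back to the xmit path. */
        static int dev_queue_xmit_sim(int depth)
        {
                int rc;

                if (xmit_recursion > RECURSION_LIMIT) {
                        fprintf(stderr, "dead loop detected at depth %d\n", depth);
                        return -1;
                }

                xmit_recursion++;
                rc = dev_queue_xmit_sim(depth + 1);     /* stacked xmit */
                xmit_recursion--;

                return rc;
        }

        int main(void)
        {
                return dev_queue_xmit_sim(0) == -1 ? 0 : 1;
        }

The counter must be incremented only around the nested call and decremented on the way out, so legitimate shallow stacking (vlan over bond over NIC) passes while a routing loop trips the limit after a few levels.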
Miller --- net/core/neighbour.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 96b1a749a..b142a0d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1210,7 +1210,9 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; - if (dev->header_ops->cache && !dst->hh) { + if (dev->header_ops->cache && + !dst->hh && + !(dst->flags & DST_NOCACHE)) { write_lock_bh(&neigh->lock); if (!dst->hh) neigh_hh_init(neigh, dst, dst->ops->protocol); -- cgit v1.1 From 24824a09e35402b8d58dcc5be803a5ad3937bdba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 Oct 2010 06:11:55 +0000 Subject: net: dynamic ingress_queue allocation ingress being not used very much, and net_device->ingress_queue being quite a big object (128 or 256 bytes), use a dynamic allocation if needed (tc qdisc add dev eth0 ingress ...) dev_ingress_queue(dev) helper should be used only with RTNL taken. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a313bab..ce6ad88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); * the ingress scheduler, you just cant add policies on ingress. * */ -static int ing_filter(struct sk_buff *skb) +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) { struct net_device *dev = skb->dev; u32 ttl = G_TC_RTTL(skb->tc_verd); - struct netdev_queue *rxq; int result = TC_ACT_OK; struct Qdisc *q; @@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->ingress_queue; - q = rxq->qdisc; if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->ingress_queue.qdisc == &noop_qdisc) + struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + + if (!rxq || rxq->qdisc == &noop_qdisc) goto out; if (*pt_prev) { @@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - switch (ing_filter(skb)) { + switch (ing_filter(skb, rxq)) { case TC_ACT_SHOT: case TC_ACT_STOLEN: kfree_skb(skb); @@ -4940,7 +4939,6 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, static void netdev_init_queue_locks(struct net_device *dev) { netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -5452,11 +5450,29 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->ingress_queue, NULL); netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); } +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + 
__netdev_init_queue_locks_one(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + /** * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -5559,6 +5575,8 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + kfree(rcu_dereference_raw(dev->ingress_queue)); + /* Flush device addresses */ dev_addr_flush(dev); -- cgit v1.1 From 1df9916e46451533463f227e6be57cc2cfca4c5f Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 4 Oct 2010 20:14:17 +0000 Subject: fib: fib_rules_cleanup can be static fib_rules_cleanup_ups is only defined and used in one place. Signed-off-by: Stephen Hemminger Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 332c2e3..cfb7d25 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) } EXPORT_SYMBOL_GPL(fib_rules_register); -void fib_rules_cleanup_ops(struct fib_rules_ops *ops) +static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; @@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops) fib_rule_put(rule); } } -EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops); static void fib_rules_put_rcu(struct rcu_head *head) { -- cgit v1.1 From caf586e5f23cebb2a68cbaf288d59dbbf2d74052 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 21:06:55 +0000 Subject: net: add a core netdev->rx_dropped counter In various situations, a device provides a packet to our stack and we drop it before it enters protocol stack : - softnet backlog full (accounted in /proc/net/softnet_stat) - bad vlan tag (not accounted) - unknown/unregistered protocol (not accounted) We can handle a per-device counter of such dropped frames at core level, and automatically adds it to the device provided stats (rx_dropped), so that standard tools can be used (ifconfig, ip link, cat /proc/net/dev) This is a generalization of commit 8990f468a (net: rx_dropped accounting), thus reverting it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ce6ad88..7d14955 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1483,8 +1483,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + if (unlikely(!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len)))) { + atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2548,6 +2549,7 @@ enqueue: local_irq_restore(flags); + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2995,6 +2997,7 @@ ncls: if (pt_prev) { ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. 
:-) @@ -5429,14 +5432,14 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); - return ops->ndo_get_stats64(dev, storage); - } - if (ops->ndo_get_stats) { + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); - return storage; + } else { + netdev_stats_to_stats64(storage, &dev->stats); + dev_txq_stats_fold(dev, storage); } - netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.1 From 110b2499370c401cdcc7c63e481084467291d556 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 04:27:36 +0000 Subject: net neigh: neigh_delete() and neigh_add() changes neigh_delete() and neigh_add() dont need to touch device refcount, we hold RTNL when calling them, so device cannot disappear under us. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b142a0d..d6996e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1531,6 +1531,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err = -EINVAL; + ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; @@ -1540,7 +1541,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; @@ -1556,34 +1557,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(dst_attr) < tbl->key_len) - goto out_dev_put; + goto out; if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; - goto out_dev_put; + goto out; } err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; -out_dev_put: - if (dev) - dev_put(dev); out: return err; } @@ -1597,6 +1595,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err; + ASSERT_RTNL(); err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) goto out; @@ -1607,14 +1606,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) - goto out_dev_put; + goto out; } read_lock(&neigh_tbl_lock); @@ -1628,7 +1627,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out_dev_put; + goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? 
nla_data(tb[NDA_LLADDR]) : NULL; @@ -1641,29 +1640,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) pn->flags = ndm->ndm_flags; err = 0; } - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; - goto out_dev_put; + goto out; } neigh = __neigh_lookup_errno(tbl, dst, dev); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); - goto out_dev_put; + goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); - goto out_dev_put; + goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) @@ -1676,15 +1675,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } else err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; - -out_dev_put: - if (dev) - dev_put(dev); out: return err; } -- cgit v1.1 From d6bf781712a1d25cc8987036b3a48535b331eb91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 06:15:44 +0000 Subject: net neigh: RCU conversion of neigh hash table David This is the first step for RCU conversion of neigh code. Next patches will convert hash_buckets[] and "struct neighbour" to RCU protected objects. Thanks [PATCH net-next] net neigh: RCU conversion of neigh hash table Instead of storing hash_buckets, hash_mask and hash_rnd in "struct neigh_table", a new structure is defined : struct neigh_hash_table { struct neighbour **hash_buckets; unsigned int hash_mask; __u32 hash_rnd; struct rcu_head rcu; }; And "struct neigh_table" has an RCU protected pointer to such a neigh_hash_table. This means the signature of (*hash)() function changed: We need to add a third parameter with the actual hash_rnd value, since this is not anymore a neigh_table field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 219 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 137 insertions(+), 82 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d6996e0..dd8920e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,14 +131,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - for (i = 0; i <= tbl->hash_mask; i++) { + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i <= nht->hash_mask; i++) { struct neighbour *n, **np; - np = &tbl->hash_buckets[i]; + np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. 
@@ -199,9 +202,13 @@ static void pneigh_queue_purge(struct sk_buff_head *list) static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) { int i; + struct neigh_hash_table *nht; - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np = &tbl->hash_buckets[i]; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n, **np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { if (dev && n->dev != dev) { @@ -297,64 +304,81 @@ out_entries: goto out; } -static struct neighbour **neigh_hash_alloc(unsigned int entries) +static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) { - unsigned long size = entries * sizeof(struct neighbour *); - struct neighbour **ret; + size_t size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour **buckets; - if (size <= PAGE_SIZE) { - ret = kzalloc(size, GFP_ATOMIC); - } else { - ret = (struct neighbour **) - __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; } + ret->hash_buckets = buckets; + ret->hash_mask = entries - 1; + get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; } -static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +static void neigh_hash_free_rcu(struct rcu_head *head) { - unsigned long size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); + struct neighbour **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) - kfree(hash); + kfree(buckets); else - free_pages((unsigned long)hash, get_order(size)); + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); } -static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_entries) { - struct neighbour **new_hash, **old_hash; - unsigned int i, new_hash_mask, old_entries; + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); BUG_ON(!is_power_of_2(new_entries)); - new_hash = neigh_hash_alloc(new_entries); - if (!new_hash) - return; + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_entries); + if (!new_nht) + return old_nht; - old_entries = tbl->hash_mask + 1; - new_hash_mask = new_entries - 1; - old_hash = tbl->hash_buckets; - - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - for (i = 0; i < old_entries; i++) { + for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_hash[i]; n; n = next) { - unsigned int hash_val = tbl->hash(n->primary_key, n->dev); + for (n = old_nht->hash_buckets[i]; + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); - hash_val &= new_hash_mask; + hash &= new_nht->hash_mask; next = n->next; - n->next = new_hash[hash_val]; - new_hash[hash_val] = n; + n->next = new_nht->hash_buckets[hash]; + new_nht->hash_buckets[hash] = n; } } - tbl->hash_buckets = new_hash; - tbl->hash_mask = new_hash_mask; - neigh_hash_free(old_hash, 
old_entries); + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, @@ -363,19 +387,23 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, dev); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; + read_lock(&tbl->lock); + for (n = nht->hash_buckets[hash_val]; n; n = n->next) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup); @@ -386,12 +414,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, NULL); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; + read_lock(&tbl->lock); + for (n = nht->hash_buckets[hash_val]; n; n = n->next) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { neigh_hold(n); @@ -399,7 +430,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, break; } } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup_nodev); @@ -411,6 +443,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); @@ -437,18 +470,20 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (n->parms->base_reachable_time << 1); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) - neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + if (atomic_read(&tbl->entries) > (nht->hash_mask + 1)) + nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1); - hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } - for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = nht->hash_buckets[hash_val]; n1; n1 = n1->next) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -456,8 +491,8 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = tbl->hash_buckets[hash_val]; - tbl->hash_buckets[hash_val] = n; + n->next = nht->hash_buckets[hash_val]; + nht->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); write_unlock_bh(&tbl->lock); @@ -698,10 +733,13 @@ static void neigh_periodic_work(struct work_struct *work) struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neighbour *n, 
**np; unsigned int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function @@ -715,8 +753,8 @@ static void neigh_periodic_work(struct work_struct *work) neigh_rand_reach_time(p->base_reachable_time); } - for (i = 0 ; i <= tbl->hash_mask; i++) { - np = &tbl->hash_buckets[i]; + for (i = 0 ; i <= nht->hash_mask; i++) { + np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { unsigned int state; @@ -1438,17 +1476,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour proc dir entry"); #endif - tbl->hash_mask = 1; - tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + tbl->nht = neigh_hash_alloc(8); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); - if (!tbl->hash_buckets || !tbl->phash_buckets) + if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - rwlock_init(&tbl->lock); INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); @@ -1504,8 +1539,8 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); - tbl->hash_buckets = NULL; + call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); + tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; @@ -1745,18 +1780,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, unsigned long now = jiffies; unsigned int flush_delta = now - tbl->last_flush; unsigned int rand_delta = now - tbl->last_rand; - + struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_hash_rnd = tbl->hash_rnd, - .ndtc_hash_mask = tbl->hash_mask, .ndtc_proxy_qlen = tbl->proxy_queue.qlen, }; + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd; + ndc.ndtc_hash_mask = nht->hash_mask; + rcu_read_unlock_bh(); + NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); } @@ -2088,14 +2127,18 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (h = 0; h <= tbl->hash_mask; h++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); + for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { + for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) { if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) @@ -2104,7 +2147,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) <= 0) { - read_unlock_bh(&tbl->lock); rc = -1; goto out; } @@ -2112,9 +2154,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, idx++; } } - read_unlock_bh(&tbl->lock); rc = skb->len; out: + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; return rc; @@ -2147,15 
+2190,20 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (chain = 0; chain <= tbl->hash_mask; chain++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = tbl->hash_buckets[chain]; n; n = n->next) + for (n = nht->hash_buckets[chain]; n; n = n->next) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_for_each); @@ -2164,11 +2212,14 @@ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { int chain; + struct neigh_hash_table *nht; - for (chain = 0; chain <= tbl->hash_mask; chain++) { + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n, **np; - np = &tbl->hash_buckets[chain]; + np = &nht->hash_buckets[chain]; while ((n = *np) != NULL) { int release; @@ -2193,13 +2244,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; int bucket = state->bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { - n = tbl->hash_buckets[bucket]; + for (bucket = 0; bucket <= nht->hash_mask; bucket++) { + n = nht->hash_buckets[bucket]; while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2234,7 +2285,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); @@ -2265,10 +2316,10 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n) break; - if (++state->bucket > tbl->hash_mask) + if (++state->bucket > nht->hash_mask) break; - n = tbl->hash_buckets[state->bucket]; + n = nht->hash_buckets[state->bucket]; } if (n && pos) @@ -2367,6 +2418,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) + __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2374,8 +2426,9 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl state->bucket = 0; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - read_lock_bh(&tbl->lock); - + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); + read_lock(&tbl->lock); return *pos ? 
neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2409,11 +2462,13 @@ EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) + __releases(rcu_bh) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); -- cgit v1.1 From ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 10:41:36 +0000 Subject: fib: RCU conversion of fib_lookup() fib_lookup() converted to be called in RCU protected context, no reference taken and released on a contended cache line (fib_clntref) fib_table_lookup() and fib_semantic_match() get an additional parameter. struct fib_info gets an rcu_head field, and is freed after an rcu grace period. Stress test : (Sending 160.000.000 UDP frames on same neighbour, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_HASH) (about same results for FIB_TRIE) Before patch : real 1m31.199s user 0m13.761s sys 23m24.780s After patch: real 1m5.375s user 0m14.997s sys 15m50.115s Before patch Profile : 13044.00 15.4% __ip_route_output_key vmlinux 8438.00 10.0% dst_destroy vmlinux 5983.00 7.1% fib_semantic_match vmlinux 5410.00 6.4% fib_rules_lookup vmlinux 4803.00 5.7% neigh_lookup vmlinux 4420.00 5.2% _raw_spin_lock vmlinux 3883.00 4.6% rt_set_nexthop vmlinux 3261.00 3.9% _raw_read_lock vmlinux 2794.00 3.3% fib_table_lookup vmlinux 2374.00 2.8% neigh_resolve_output vmlinux 2153.00 2.5% dst_alloc vmlinux 1502.00 1.8% _raw_read_lock_bh vmlinux 1484.00 1.8% kmem_cache_alloc vmlinux 1407.00 1.7% eth_header vmlinux 1406.00 1.7% ipv4_dst_destroy vmlinux 1298.00 1.5% __copy_from_user_ll vmlinux 1174.00 1.4% dev_queue_xmit vmlinux 1000.00 1.2% ip_output vmlinux After patch Profile : 13712.00 15.8% dst_destroy vmlinux 8548.00 9.9% __ip_route_output_key vmlinux 7017.00 8.1% neigh_lookup vmlinux 4554.00 5.3% fib_semantic_match vmlinux 4067.00 4.7% _raw_read_lock vmlinux 3491.00 4.0% dst_alloc vmlinux 3186.00 3.7% neigh_resolve_output vmlinux 3103.00 3.6% fib_table_lookup vmlinux 2098.00 2.4% _raw_read_lock_bh vmlinux 2081.00 2.4% kmem_cache_alloc vmlinux 2013.00 2.3% _raw_spin_lock vmlinux 1763.00 2.0% __copy_from_user_ll vmlinux 1763.00 2.0% ip_output vmlinux 1761.00 2.0% ipv4_dst_destroy vmlinux 1631.00 1.9% eth_header vmlinux 1440.00 1.7% _raw_read_unlock_bh vmlinux Reference results, if IP route cache is enabled : real 0m29.718s user 0m10.845s sys 7m37.341s 25213.00 29.5% __ip_route_output_key vmlinux 9011.00 10.5% dst_release vmlinux 4817.00 5.6% ip_push_pending_frames vmlinux 4232.00 5.0% ip_finish_output vmlinux 3940.00 4.6% udp_sendmsg vmlinux 3730.00 4.4% __copy_from_user_ll vmlinux 3716.00 4.4% ip_route_output_flow vmlinux 2451.00 2.9% __xfrm_lookup vmlinux 2221.00 2.6% ip_append_data vmlinux 1718.00 2.0% _raw_spin_lock_bh vmlinux 1655.00 1.9% __alloc_skb vmlinux 1572.00 1.8% sock_wfree vmlinux 1345.00 1.6% kfree vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/fib_rules.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index cfb7d25..21698f8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -225,7 +225,8 @@ jumped: err = ops->action(rule, fl, flags, arg); if (err != -EAGAIN) { - if (likely(atomic_inc_not_zero(&rule->refcnt))) { + if ((arg->flags & FIB_LOOKUP_NOREF) || + likely(atomic_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } -- cgit v1.1 From 767e97e1e0db0d0f3152cd2f3bd3403596aedbad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Oct 2010 17:49:21 -0700 Subject: neigh: RCU conversion of struct neighbour This is the second step for neighbour RCU conversion. (first was commit d6bf7817 : RCU conversion of neigh hash table) neigh_lookup() becomes lockless, but still take a reference on found neighbour. (no more read_lock()/read_unlock() on tbl->lock) struct neighbour gets an additional rcu_head field and is freed after an RCU grace period. Future work would need to eventually not take a reference on neighbour for temporary dst (DST_NOCACHE), but this would need dst->_neighbour to use a noref bit like we did for skb->_dst. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 137 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 52 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dd8920e..3ffafaa0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -139,10 +139,12 @@ static int neigh_forced_gc(struct neigh_table *tbl) nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i <= nht->hash_mask; i++) { - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. 
* - it is not permanent @@ -150,7 +152,9 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if (atomic_read(&n->refcnt) == 1 && !(n->nud_state & NUD_PERMANENT)) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; shrunk = 1; write_unlock(&n->lock); @@ -208,14 +212,18 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) lockdep_is_held(&tbl->lock)); for (i = 0; i <= nht->hash_mask; i++) { - struct neighbour *n, **np = &nht->hash_buckets[i]; + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { if (dev && n->dev != dev) { np = &n->next; continue; } - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); n->dead = 1; @@ -323,7 +331,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) kfree(ret); return NULL; } - ret->hash_buckets = buckets; + rcu_assign_pointer(ret->hash_buckets, buckets); ret->hash_mask = entries - 1; get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; @@ -362,17 +370,22 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_nht->hash_buckets[i]; + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); n != NULL; n = next) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash &= new_nht->hash_mask; - next = n->next; - - n->next = new_nht->hash_buckets[hash]; - new_nht->hash_buckets[hash] = n; + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } @@ -394,15 +407,18 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; - read_lock(&tbl->lock); - for (n = nht->hash_buckets[hash_val]; n; n = n->next) { + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } @@ -421,16 +437,19 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; - read_lock(&tbl->lock); - for (n = nht->hash_buckets[hash_val]; n; n = n->next) { + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } @@ -483,7 +502,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = nht->hash_buckets[hash_val]; n1; n1 = 
n1->next) { + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -491,10 +514,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = nht->hash_buckets[hash_val]; - nht->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); NEIGH_PRINTK2("neigh %p is created.\n", n); rc = n; @@ -651,6 +676,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms) neigh_parms_destroy(parms); } +static void neigh_destroy_rcu(struct rcu_head *head) +{ + struct neighbour *neigh = container_of(head, struct neighbour, rcu); + + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} /* * neighbour must already be out of the table; * @@ -690,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh) NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); atomic_dec(&neigh->tbl->entries); - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); + call_rcu(&neigh->rcu, neigh_destroy_rcu); } EXPORT_SYMBOL(neigh_destroy); @@ -731,7 +762,8 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; unsigned int i; struct neigh_hash_table *nht; @@ -756,7 +788,8 @@ static void neigh_periodic_work(struct work_struct *work) for (i = 0 ; i <= nht->hash_mask; i++) { np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { unsigned int state; write_lock(&n->lock); @@ -1213,8 +1246,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, } /* This function can be used in contexts, where only old dev_queue_xmit - worked, f.e. if you want to override normal output path (eql, shaper), - but resolution is not made yet. + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. 
*/ int neigh_compat_output(struct sk_buff *skb) @@ -2123,7 +2156,7 @@ static void neigh_update_notify(struct neighbour *neigh) static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { - struct net * net = sock_net(skb->sk); + struct net *net = sock_net(skb->sk); struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; @@ -2132,13 +2165,14 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) @@ -2150,13 +2184,12 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rc = -1; goto out; } - next: +next: idx++; } } rc = skb->len; out: - read_unlock(&tbl->lock); rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; @@ -2195,11 +2228,13 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + read_lock(&tbl->lock); /* avoid resizes */ for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = nht->hash_buckets[chain]; n; n = n->next) + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) cb(n, cookie); } read_unlock(&tbl->lock); @@ -2217,16 +2252,20 @@ void __neigh_for_each_release(struct neigh_table *tbl, nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain <= nht->hash_mask; chain++) { - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; np = &nht->hash_buckets[chain]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { int release; write_lock(&n->lock); release = cb(n); if (release) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; } else np = &n->next; @@ -2250,7 +2289,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) state->flags &= ~NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= nht->hash_mask; bucket++) { - n = nht->hash_buckets[bucket]; + n = rcu_dereference_bh(nht->hash_buckets[bucket]); while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2267,8 +2306,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) break; if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2292,7 +2331,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (v) return n; } - n = n->next; + n = rcu_dereference_bh(n->next); while (1) { while (n) { @@ -2309,8 +2348,8 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2319,7 +2358,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (++state->bucket > nht->hash_mask) break; - n = nht->hash_buckets[state->bucket]; + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } if (n && pos) @@ -2417,7 +2456,6 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct 
seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) - __acquires(tbl->lock) __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2428,7 +2466,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock_bh(); state->nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2461,13 +2499,8 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) - __releases(tbl->lock) __releases(rcu_bh) { - struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; - - read_unlock(&tbl->lock); rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); -- cgit v1.1 From 3d3211ef5cf7558d289d1a1efbef8c45595639aa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 6 Oct 2010 23:35:15 -0700 Subject: net: netif_set_real_num_rx_queues may cap num_rx_queues at init time Do not set num_rx_queues in netif_set_real_num_rx_queues(); some drivers will increase real_num_rx_queues later due to feature changes or an increase in available interrupts. Setting num_rx_queues here ends up creating a cap on the number of rx queues available. For example, the ixgbe driver sets the maximum number of queues it ever intends to use, then sets the current number in use with the netif_set_num_{rx|tx}_queues calls. With the current implementation the number of rx queues gets limited, so when a feature such as DCB or FCoE is enabled the queues are no longer available. kobjects will only be allocated for real_num_rx_queues, so the waste in memory is minimal. Signed-off-by: John Fastabend Signed-off-by: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7d14955..fd1b75a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1593,8 +1593,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) rxq); if (rc) return rc; - } else { - dev->num_rx_queues = rxq; } dev->real_num_rx_queues = rxq; -- cgit v1.1 From 4e7f79511e7332ae4056eda9156a0299511ea41e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 8 Oct 2010 10:33:39 -0700 Subject: net: Update kernel-doc for netif_set_real_num_rx_queues() Synchronise the comment with the preceding implementation change. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fd1b75a..4962c8a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1576,8 +1576,8 @@ EXPORT_SYMBOL(netif_set_real_num_tx_queues); * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a - * negative error code. If called before registration, it also - * sets the maximum number of queues, and always succeeds. + * negative error code. If called before registration, it always + * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { -- cgit v1.1
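For driver authors, the combined effect of the two netif_set_real_num_rx_queues() patches above is that a device can reserve its worst-case queue count once, at allocation time, and then move the active count up and down as features toggle. A minimal sketch of that calling pattern (the function name and queue numbers are hypothetical, not taken from these patches):

	/* Hypothetical reconfiguration path for a driver that sized
	 * num_rx_queues for its maximum at alloc time. Only the active
	 * count changes here; rtnl_lock must be held once the device
	 * is registered.
	 */
	static int example_reconfig_rx(struct net_device *dev, bool dcb_on)
	{
		unsigned int rxq = dcb_on ? 16 : 4;

		return netif_set_real_num_rx_queues(dev, rxq);
	}
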
From 4315d834c1496ddca977e9e22002b77c85bfec2c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 7 Oct 2010 10:09:10 +0000 Subject: net: Fix rxq ref counting The rx->count reference is used to track reference counts to the number of rx-queue kobjects created for the device. This patch eliminates initialization of the counter in netif_alloc_rx_queues and instead increments the counter each time a kobject is created. This is now symmetric with the decrement that is done when an object is released. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - net/core/net-sysfs.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4962c8a..193eafa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5024,7 +5024,6 @@ static int netif_alloc_rx_queues(struct net_device *dev) return -ENOMEM; } dev->_rx = rx; - atomic_set(&rx->count, count); /* * Set a pointer to first element in the array which holds the diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fa81fd0..b143173 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -726,6 +726,7 @@ static struct kobj_type rx_queue_ktype = { static int rx_queue_add_kobject(struct net_device *net, int index) { struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *first = queue->first; struct kobject *kobj = &queue->kobj; int error = 0; @@ -738,6 +739,7 @@ static int rx_queue_add_kobject(struct net_device *net, int index) } kobject_uevent(kobj, KOBJ_ADD); + atomic_inc(&first->count); return error; } -- cgit v1.1 From 34d101dd6204bd100fc2e6f7b5f9a10f959ce2c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 09:16:57 -0700 Subject: neigh: speedup neigh_hh_init() When a new dst is used to send a frame, neigh_resolve_output() tries to associate a struct hh_cache to this dst, calling neigh_hh_init() with the neigh rwlock write locked. Most of the time, hh_cache is already known and linked into the neighbour, so we find it and increment its refcount. This patch changes the logic so that we call neigh_hh_init() with the neighbour lock read locked only, so that the fast path can be run in parallel by concurrent cpus. This brings part of the speedup we got with commit c7d4426a98a5f (introduce DST_NOCACHE flag) for non-cached dsts, even for cached ones, removing one of the contention points that routers hit on multiqueue-enabled machines. Further improvements would need to use a seqlock instead of an rwlock to protect neigh->ha[], to not dirty neigh too often and remove two atomic ops. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/dst.c | 4 +-- net/core/neighbour.c | 99 ++++++++++++++++++++++++++++++++-------------------- 2 files changed, 63 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 6c41b1f..978a1ee 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -228,8 +228,8 @@ again: child = dst->child; dst->hh = NULL; - if (hh && atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + if (hh) + hh_cache_put(hh); if (neigh) { dst->neighbour = NULL; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3ffafaa0..2044906 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -709,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh) write_seqlock_bh(&hh->hh_lock); hh->hh_output = neigh_blackhole; write_sequnlock_bh(&hh->hh_lock); - if (atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + hh_cache_put(hh); } skb_queue_purge(&neigh->arp_queue); @@ -1210,39 +1209,67 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, } EXPORT_SYMBOL(neigh_event_ns); +static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, + __be16 protocol) +{ + struct hh_cache *hh; + + for (hh = n->hh; hh; hh = hh->hh_next) { + if (hh->hh_type == protocol) { + atomic_inc(&hh->hh_refcnt); + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); + return true; + } + } + return false; +} + +/* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, __be16 protocol) { struct hh_cache *hh; struct net_device *dev = dst->dev; - for (hh = n->hh; hh; hh = hh->hh_next) - if (hh->hh_type == protocol) - break; + if (likely(neigh_hh_lookup(n, dst, protocol))) + return; - if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { - seqlock_init(&hh->hh_lock); - hh->hh_type = protocol; - atomic_set(&hh->hh_refcnt, 0); - hh->hh_next = NULL; + /* slow path */ + hh = kzalloc(sizeof(*hh), GFP_ATOMIC); + if (!hh) + return; - if (dev->header_ops->cache(n, hh)) { - kfree(hh); - hh = NULL; - } else { - atomic_inc(&hh->hh_refcnt); - hh->hh_next = n->hh; - n->hh = hh; - if (n->nud_state & NUD_CONNECTED) - hh->hh_output = n->ops->hh_output; - else - hh->hh_output = n->ops->output; - } + seqlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 2); + + if (dev->header_ops->cache(n, hh)) { + kfree(hh); + return; } - if (hh) { - atomic_inc(&hh->hh_refcnt); - dst->hh = hh; + read_unlock(&n->lock); + write_lock(&n->lock); + + /* must check if another thread already did the insert */ + if (neigh_hh_lookup(n, dst, protocol)) { + kfree(hh); + goto end; } + + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + + hh->hh_next = n->hh; + n->hh = hh; + + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); +end: + write_unlock(&n->lock); + read_lock(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit @@ -1281,21 +1308,17 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; + + read_lock_bh(&neigh->lock); if (dev->header_ops->cache && !dst->hh && - !(dst->flags & DST_NOCACHE)) { - write_lock_bh(&neigh->lock); - if (!dst->hh) - neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - write_unlock_bh(&neigh->lock); - } else { - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - 
neigh->ha, NULL, skb->len); - write_unlock_bh(&neigh->lock); - } else { - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); - } + !(dst->flags & DST_NOCACHE)) + neigh_hh_init(neigh, dst, dst->ops->protocol); + + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + if (err >= 0) rc = neigh->ops->queue_xmit(skb); else -- cgit v1.1 From 0ed8ddf4045fcfcac36bad753dc4046118c603ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Oct 2010 10:44:07 +0000 Subject: neigh: Protect neigh->ha[] with a seqlock Add a seqlock in struct neighbour to protect neigh->ha[], and avoid dirtying neighbour in stress situations (many different flows / dsts). Dirtying takes place because of read_lock(&n->lock) and n->used writes. Switching to a seqlock, and writing n->used only when jiffies changes, permits less dirtying. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2044906..b165b96 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -294,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; @@ -1015,7 +1016,7 @@ out_unlock_bh: } EXPORT_SYMBOL(__neigh_event_send); -static void neigh_update_hhs(struct neighbour *neigh) +static void neigh_update_hhs(const struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) @@ -1151,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - @@ -1214,6 +1217,7 @@ static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, { struct hh_cache *hh; + smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */ for (hh = n->hh; hh; hh = hh->hh_next) { if (hh->hh_type == protocol) { atomic_inc(&hh->hh_refcnt); @@ -1248,8 +1252,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, kfree(hh); return; } - read_unlock(&n->lock); - write_lock(&n->lock); + + write_lock_bh(&n->lock); /* must check if another thread already did the insert */ if (neigh_hh_lookup(n, dst, protocol)) { @@ -1263,13 +1267,13 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, hh->hh_output = n->ops->output; hh->hh_next = n->hh; + smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */ n->hh = hh; if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) hh_cache_put(hh); end: - write_unlock(&n->lock); - read_lock(&n->lock); + write_unlock_bh(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit @@ -1308,16 +1312,18 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; + unsigned int seq; - read_lock_bh(&neigh->lock); if (dev->header_ops->cache && !dst->hh && !(dst->flags & DST_NOCACHE)) neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = neigh->ops->queue_xmit(skb); @@ -1344,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; + unsigned int seq; __skb_pull(skb, skb_network_offset(skb)); - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + if (err >= 0) err = neigh->ops->queue_xmit(skb); else { @@ -2148,10 +2157,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; - if ((neigh->nud_state & NUD_VALID) && - nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) { - read_unlock_bh(&neigh->lock); - goto nla_put_failure; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); -- cgit v1.1
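The neigh_ha_snapshot() helper used in the neigh_fill_info() hunk above is added to include/net/neighbour.h by this patch; reconstructed from its use here, it is approximately the canonical seqlock read loop:

	/* Approximate shape of the new helper: copy a consistent view of
	 * neigh->ha[] without taking neigh->lock, retrying if a writer
	 * (neigh_update) raced with us.
	 */
	static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
					     const struct net_device *dev)
	{
		unsigned int seq;

		do {
			seq = read_seqbegin(&n->ha_lock);
			memcpy(dst, n->ha, dev->addr_len);
		} while (read_seqretry(&n->ha_lock, seq));
	}
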
From fc66f95c68b6d4535a0ea2ea15d5cf626e310956 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Oct 2010 06:37:34 +0000 Subject: net dst: use a percpu_counter to track entries struct dst_ops tracks the number of allocated dsts in an atomic_t field, subject to high cache line contention in stress workloads. Switch to a percpu_counter, to reduce the number of times we need to dirty a central location. Place it on a separate cache line to avoid dirtying read-only fields. Stress test: (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE, SLUB/NUMA) Before: real 0m51.179s user 0m15.329s sys 10m15.942s After: real 0m45.570s user 0m15.525s sys 9m56.669s With a small reordering of struct neighbour fields, the subject of a following patch (to separate refcnt from other read-mostly fields): real 0m41.841s user 0m15.261s sys 8m45.949s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dst.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 978a1ee..32e542d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops) { struct dst_entry *dst; - if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) return NULL; } @@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops) #if RT_CACHE_DEBUG >= 2 atomic_inc(&dst_total); #endif - atomic_inc(&ops->entries); + dst_entries_add(ops, 1); return dst; } EXPORT_SYMBOL(dst_alloc); @@ -236,7 +236,7 @@ again: neigh_release(neigh); } - atomic_dec(&dst->ops->entries); + dst_entries_add(dst->ops, -1); if (dst->ops->destroy) dst->ops->destroy(dst); -- cgit v1.1
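The dst_entries_get_fast()/dst_entries_add() helpers this diff calls are thin wrappers over the generic percpu_counter API (they live in include/net/dst_ops.h alongside the new pcpuc_entries field); reconstructed from their use above, they are approximately:

	#include <linux/percpu_counter.h>

	/* Cheap, possibly slightly stale read for the gc_thresh check. */
	static inline int dst_entries_get_fast(struct dst_ops *dst)
	{
		return percpu_counter_read_positive(&dst->pcpuc_entries);
	}

	/* Per-cpu add; only folds into the central count in batches. */
	static inline void dst_entries_add(struct dst_ops *dst, int val)
	{
		percpu_counter_add(&dst->pcpuc_entries, val);
	}
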
From 29b4433d991c88d86ca48a4c1cc33c671475be4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 10:22:12 +0000 Subject: net: percpu net_device refcount We tried very hard to remove all possible dev_hold()/dev_put() pairs in the network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet). We can switch to a percpu refcount implementation, now that the dynamic per_cpu infrastructure is mature. On a 64-cpu machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code: before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench: (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 193eafa..04972a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -5243,6 +5245,16 @@ out: } EXPORT_SYMBOL(register_netdev); +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -5350,7 +5367,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); @@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (dev_addr_init(dev)) + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) goto free_tx; + if (dev_addr_init(dev)) + goto free_pcpu; + dev_mc_init(dev); dev_uc_init(dev); @@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_tx: kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); free_p: kfree(p); return NULL; @@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); -- cgit v1.1
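The hunks above only show the read side; the holders that pair with netdev_refcnt_read() are the netdevice.h inlines this patch converts, which become plain per-cpu increments with no shared cache line, roughly (matching the x86 before/after fragment quoted in the changelog):

	static inline void dev_hold(struct net_device *dev)
	{
		this_cpu_inc(*dev->pcpu_refcnt);
	}

	static inline void dev_put(struct net_device *dev)
	{
		this_cpu_dec(*dev->pcpu_refcnt);
	}
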
From 564824b0c52c34692d804bb6ea214451615b0b50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 19:05:25 +0000 Subject: net: allocate skbs on local node commit b30973f877 (node-aware skb allocation) spread a wrong habit of allocating net drivers' skbs on a given memory node: the one closest to the NIC hardware. This is wrong because as soon as we try to scale the network stack, we need to use many cpus to handle traffic and hit slub/slab management on cross-node allocations/frees when these cpus have to alloc/free skbs bound to a central node. skbs allocated in the RX path are ephemeral; they have a very short lifetime. The extra cost of maintaining NUMA affinity is too expensive. What appeared as a nice idea four years ago is in fact a bad one. In 2010, NIC hardware is multiqueue, or we use RPS to spread the load, and two 10Gb NICs might deliver more than 28 million packets per second, needing all the available cpus. The cost of cross-node handling in the network and vm stacks outweighs the small benefit hardware had when doing its DMA transfer into its 'local' memory node at RX time. Even trying to differentiate the two allocations done for one skb (the sk_buff on the local node, the data part on the NIC hardware node) is not enough to bring good performance. Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/skbuff.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 752c197..4e8b82e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -247,10 +247,9 @@ EXPORT_SYMBOL(__alloc_skb); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask) { - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; struct sk_buff *skb; - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; @@ -259,16 +258,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, } EXPORT_SYMBOL(__netdev_alloc_skb); -struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) -{ - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; - struct page *page; - - page = alloc_pages_node(node, gfp_mask, 0); - return page; -} -EXPORT_SYMBOL(__netdev_alloc_page); - void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size) { -- cgit v1.1
From a0a4a85a15df6335e3d11f83b2ac06ebebea313f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Oct 2010 04:43:04 +0000 Subject: fib: remove a useless synchronize_rcu() call fib_nl_delrule() calls synchronize_rcu() for no apparent reason, while rtnl is held. I suspect it was done to avoid an atomic_inc_not_zero() in fib_rules_lookup(), which commit 7fa7cb7109d07 added anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 21698f8..1bc3f25 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -494,7 +494,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } } - synchronize_rcu(); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); fib_rule_put(rule); -- cgit v1.1 From c2355e1ab910278a94d487b78590ee3c8eecd08a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 13 Oct 2010 16:01:49 +0000 Subject: bonding: Fix bonding driver's improper modification of netpoll structure The bonding driver currently modifies the netpoll structure in its xmit path while sending frames from netpoll. This is racy, as other cpus can access the netpoll structure in parallel. Since the bonding driver points np->dev to a slave device, other cpus can inadvertently attempt to send data directly to slave devices, leading to improper locking with the bonding master, lost frames, and deadlocks. This patch fixes that up. This patch also removes the real_dev pointer from the netpoll structure, as that data is really only used by bonding in the poll_controller, and we can emulate its behavior by checking each slave for IS_UP. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 537e01a..4e98ffa 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -288,11 +288,11 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, + struct net_device *dev) { int status = NETDEV_TX_BUSY; unsigned long tries; - struct net_device *dev = np->dev; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo = np->dev->npinfo; @@ -346,7 +346,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(netpoll_send_skb); +EXPORT_SYMBOL(netpoll_send_skb_on_dev); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.1
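Existing netpoll_send_skb() callers keep working because the old entry point survives as an inline wrapper over the renamed function (added to include/linux/netpoll.h by this patch); it looks approximately like:

	/* Approximate shape of the compatibility wrapper: same behaviour
	 * as before, while bonding can now pass the slave device
	 * explicitly via netpoll_send_skb_on_dev().
	 */
	static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
	{
		unsigned long flags;

		local_irq_save(flags);
		netpoll_send_skb_on_dev(np, skb, np->dev);
		local_irq_restore(flags);
	}
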
From 990c3d6f9c4115347659fc2b163907c8c832ae44 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 13 Oct 2010 16:01:51 +0000 Subject: bonding: Fix napi poll for bonding driver Usually the netpoll path, when performing a napi poll, can get away with just polling all the napi instances of the configured device. That's not the case for the bonding driver, however, as the napi instances which may wind up getting flagged as needing polling after the poll_controller call don't belong to the bonded device, but rather to the slave devices. Fix this by checking the device in question for the IFF_MASTER flag; if set, we know we need to check the full poll list for this cpu, rather than just the device's napi instance list. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4e98ffa..d79d221 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -156,8 +156,15 @@ static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int budget = 16; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct list_head *nlist; - list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (dev->flags & IFF_MASTER) + nlist = &sd->poll_list; + else + nlist = &dev->napi_list; + + list_for_each_entry(napi, nlist, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { budget = poll_one_napi(dev->npinfo, napi, budget); -- cgit v1.1 From f13d493d9cf772d510d78ae00bb9f4d680b3170b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 19 Oct 2010 07:04:26 +0000 Subject: netpoll: Revert napi_poll fix for bonding driver In an earlier patch I modified napi_poll so that devices with IFF_MASTER polled the per_cpu list instead of the device list for napi. I did this because the bonding driver has no napi instances to poll; it instead expects to check the slave devices' napi instances, which napi_poll was unaware of. Looking at this more closely, however, I now see this isn't strictly needed: the bond driver's poll_controller calls the slaves' poll_controller via netpoll_poll_dev, which recursively calls poll_napi on each slave, allowing those napi instances to get serviced. The earlier patch isn't at all harmful, it's just not needed, so let's revert it to make the code cleaner. Sorry for the noise. Signed-off-by: Neil Horman Reviewed-by: WANG Cong Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d79d221..4e98ffa 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -156,15 +156,8 @@ static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int budget = 16; - struct softnet_data *sd = &__get_cpu_var(softnet_data); - struct list_head *nlist; - if (dev->flags & IFF_MASTER) - nlist = &sd->poll_list; - else - nlist = &dev->napi_list; - - list_for_each_entry(napi, nlist, dev_list) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { budget = poll_one_napi(dev->npinfo, napi, budget); -- cgit v1.1 From 55513fb4281464e97aa1ff2b9c906ca5aed917c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 17:55:58 +0000 Subject: net: fail alloc_netdev_mq if queue count < 1 In alloc_netdev_mq, fail if the requested queue_count < 1. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 04972a4..f44d29a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5511,6 +5511,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); + if (queue_count < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero queues.\n"); + return NULL; + } + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ -- cgit v1.1 From bd25fa7ba59cd26094319dfba0011b48465f7355 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:00:16 +0000 Subject: net: cleanups in RX queue allocation Clean up RX queue allocation. In netif_set_real_num_rx_queues, return an error on an attempt to set zero queues, or when the requested number is greater than the number of allocated queues. In netif_alloc_rx_queues, BUG_ON if queue_count is zero. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f44d29a..d33adec 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1583,12 +1583,12 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); - if (rxq > dev->num_rx_queues) - return -EINVAL; - rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) @@ -5013,25 +5013,23 @@ static int netif_alloc_rx_queues(struct net_device *dev) { #ifdef CONFIG_RPS unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; - if (count) { - struct netdev_rx_queue *rx; - - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - pr_err("netdev: Unable to allocate %u rx queues.\n", - count); - return -ENOMEM; - } - dev->_rx = rx; + BUG_ON(count < 1); - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ - for (i = 0; i < count; i++) - rx[i].first = rx; + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", count); + return -ENOMEM; } + dev->_rx = rx; + + /* + * Set a pointer to first element in the array which holds the + * reference count. + */ + for (i = 0; i < count; i++) + rx[i].first = rx; #endif return 0; } -- cgit v1.1 From e6484930d7c73d324bccda7d43d131088da697b9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:04:39 +0000 Subject: net: allocate tx queues in register_netdevice This patch introduces netif_alloc_netdev_queues, which is called from register_netdevice instead of alloc_netdev_mq. This makes TX queue allocation symmetric with RX allocation. Also, queue lock allocation is done in netdev_init_one_queue. Change set_real_num_tx_queues to fail if the requested number is < 1 or greater than the number of allocated queues. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/dev.c | 106 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d33adec..4c3ac53 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1553,18 +1553,20 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. */ -void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { - unsigned int real_num = dev->real_num_tx_queues; + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; - if (unlikely(txq > dev->num_tx_queues)) - ; - else if (txq > real_num) - dev->real_num_tx_queues = txq; - else if (txq < real_num) { - dev->real_num_tx_queues = txq; - qdisc_reset_all_tx_gt(dev, txq); + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + if (txq < dev->real_num_tx_queues) + qdisc_reset_all_tx_gt(dev, txq); } + + dev->real_num_tx_queues = txq; + return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); @@ -4928,20 +4930,6 @@ static void rollback_registered(struct net_device *dev) rollback_registered_many(&single); } -static void __netdev_init_queue_locks_one(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - spin_lock_init(&dev_queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; -} - -static void netdev_init_queue_locks(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); -} - unsigned long netdev_fix_features(unsigned long features, const char *name) { /* Fix illegal SG+CSUM combinations. 
*/ @@ -5034,6 +5022,41 @@ static int netif_alloc_rx_queues(struct net_device *dev) return 0; } +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + + BUG_ON(count < 1); + + tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + pr_err("netdev: Unable to allocate %u tx queues.\n", + count); + return -ENOMEM; + } + dev->_tx = tx; + return 0; +} + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, + void *_unused) +{ + queue->dev = dev; + + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; +} + +static void netdev_init_queues(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); +} + /** * register_netdevice - register a network device * @dev: device to register @@ -5067,7 +5090,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - netdev_init_queue_locks(dev); dev->iflink = -1; @@ -5075,6 +5097,12 @@ int register_netdevice(struct net_device *dev) if (ret) goto out; + ret = netif_alloc_netdev_queues(dev); + if (ret) + goto out; + + netdev_init_queues(dev); + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5456,19 +5484,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } EXPORT_SYMBOL(dev_get_stats); -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) -{ - queue->dev = dev; -} - -static void netdev_init_queues(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); - spin_lock_init(&dev->tx_global_lock); -} - struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -5480,7 +5495,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - __netdev_init_queue_locks_one(dev, queue, NULL); queue->qdisc = &noop_qdisc; queue->qdisc_sleeping = &noop_qdisc; rcu_assign_pointer(dev->ingress_queue, queue); @@ -5502,7 +5516,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { - struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; struct net_device *p; @@ -5530,20 +5543,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return NULL; } - tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "tx qdiscs.\n"); - goto free_p; - } - - dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_tx; + goto free_p; if (dev_addr_init(dev)) goto free_pcpu; @@ -5553,7 +5558,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->_tx = tx; dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; @@ -5564,8 +5568,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; - netdev_init_queues(dev); - 
INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); dev->ethtool_ntuple_list.count = 0; INIT_LIST_HEAD(&dev->napi_list); @@ -5576,8 +5578,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; -free_tx: - kfree(tx); free_pcpu: free_percpu(dev->pcpu_refcnt); free_p: -- cgit v1.1 From 27b75c95f10d249574d9c4cb9dab878107faede8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Oct 2010 05:44:11 +0000 Subject: net: avoid RCU for NOCACHE dst There is no point using RCU for dsts we allocate for a very short time (used once). Change dst_release() to take DST_NOCACHE into account, but also change skb_dst_set_noref() to force a refcount increment for such dsts. This is a _huge_ gain, because we don't waste memory storing xx thousand dsts. Instead of queueing them to RCU, we can free them instantly. CPU caches can stay hot, re-using the same memory blocks to hold temporary dsts. Note: remove the unneeded smp_mb__before_atomic_dec() in dst_release(), since atomic_dec_return() implies a full memory barrier. Stress test, 160.000.000 udp frames sent, IP route cache disabled (DDOS). Before: real 0m38.091s user 0m13.189s sys 7m53.018s After: real 0m29.946s user 0m12.157s sys 7m40.605s For reference, if IP route cache was enabled: real 0m32.030s user 0m10.521s sys 8m15.243s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dst.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 32e542d..8abe628 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst) if (dst) { int newrefcnt; - smp_mb__before_atomic_dec(); newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } EXPORT_SYMBOL(dst_release); +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + /* If dst not in cache, we must take a reference, because + * dst_release() will destroy dst as soon as its refcount becomes zero + */ + if (unlikely(dst->flags & DST_NOCACHE)) { + dst_hold(dst); + skb_dst_set(skb, dst); + } else { + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; + } +} +EXPORT_SYMBOL(skb_dst_set_noref); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice -- cgit v1.1
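Per the WARN_ON in skb_dst_set_noref() above, the helper is only valid inside an RCU read-side section, and a DST_NOCACHE dst transparently falls back to a normal referenced attach. A sketch of the expected caller pattern (the helper function itself is hypothetical):

	/* Hypothetical input-path caller: dst comes from an RCU-protected
	 * lookup and no reference is taken, so the skb must be consumed
	 * before rcu_read_unlock().
	 */
	static void example_deliver(struct sk_buff *skb, struct dst_entry *dst)
	{
		rcu_read_lock();
		skb_dst_set_noref(skb, dst);
		dst_input(skb);		/* consumes the skb */
		rcu_read_unlock();
	}
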
From 7b9c60903714bf0a19d746b228864bad3497284e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:04 +0000 Subject: vlan: Enable software emulation for vlan acceleration. Currently users of hardware vlan acceleration need to know whether the device supports it before generating packets. However, vlan acceleration will soon be available in a more flexible manner so knowing ahead of time becomes much more difficult. This adds a software fallback path for vlan packets on devices without the necessary offloading support, similar to other types of hardware acceleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4c3ac53..1bfd96b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1694,7 +1694,12 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) { - if (can_checksum_protocol(dev->features, skb->protocol)) + int features = dev->features; + + if (vlan_tx_tag_present(skb)) + features &= dev->vlan_features; + + if (can_checksum_protocol(features, skb->protocol)) return true; if (skb->protocol == htons(ETH_P_8021Q)) { @@ -1793,6 +1798,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) __be16 type = skb->protocol; int err; + if (type == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh; + + if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + return ERR_PTR(-EINVAL); + + veh = (struct vlan_ethhdr *)skb->data; + type = veh->h_vlan_encapsulated_proto; + } + skb_reset_mac_header(skb); skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); @@ -1964,9 +1979,14 @@ static inline void skb_orphan_try(struct sk_buff *skb) static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { + int features = dev->features; + + if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb)) + features &= dev->vlan_features; + return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || + ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) || illegal_highdma(dev, skb)))); } @@ -1989,6 +2009,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_orphan_try(skb); + if (vlan_tx_tag_present(skb) && + !(dev->features & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + goto out; + + skb->vlan_tci = 0; + } + if (netif_needs_gso(dev, skb)) { if (unlikely(dev_gso_segment(skb))) goto out_kfree_skb; @@ -2050,6 +2079,7 @@ out_kfree_gso_skb: skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); +out: return rc; } -- cgit v1.1
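With the fallback above in dev_hard_start_xmit(), a sender no longer needs to check NETIF_F_HW_VLAN_TX itself: it can always hand the tag over in skb->vlan_tci and let the core insert it in software when the device cannot. A sketch of that transmit pattern (the helper function and VLAN id are illustrative):

	#include <linux/if_vlan.h>

	/* Hypothetical transmit helper: the tag rides in skb->vlan_tci;
	 * dev_hard_start_xmit() pushes it into the frame in software
	 * only if the device lacks NETIF_F_HW_VLAN_TX.
	 */
	static int example_xmit_tagged(struct sk_buff *skb, u16 vid)
	{
		skb = __vlan_hwaccel_put_tag(skb, vid);
		return dev_queue_xmit(skb);
	}
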
From 3701e51382a026cba10c60b03efabe534fba4ca4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:06 +0000 Subject: vlan: Centralize handling of hardware acceleration. Currently each driver that is capable of vlan hardware acceleration must be aware of the vlan groups that are configured and then pass the stripped tag to a specialized receive function. This is different from other types of hardware offload in that it places a significant amount of knowledge in the driver itself rather than keeping it in the networking core. This makes vlan offloading function more similarly to other forms of offloading (such as checksum offloading or TSO) by doing the following: * On receive, stripped vlans are passed directly to the network core, without attempting to check for vlan groups or reconstructing the header if no group * vlans are made less special by folding the logic into the main receive routines * On transmit, the device layer will add the vlan header in software if the hardware doesn't support it, instead of spreading that logic out in upper layers, such as bonding. There are a number of advantages to this: * Fixes all bugs with drivers incorrectly dropping vlan headers at once. * Avoids having to disable VLAN acceleration when in promiscuous mode (good for bridging since it always puts devices in promiscuous mode). * Keeps VLAN tag separate until given to ultimate consumer, which avoids needing to do header reconstruction as in tg3 unless absolutely necessary. * Consolidates common code in core networking. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/dev.c | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1bfd96b..97fd6bc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2789,33 +2789,6 @@ out: } #endif -/* - * netif_nit_deliver - deliver received packets to network taps - * @skb: buffer - * - * This function is used to deliver incoming packets to network - * taps. It should be used when the normal netif_receive_skb path - * is bypassed, for example because of VLAN acceleration. - */ -void netif_nit_deliver(struct sk_buff *skb) -{ - struct packet_type *ptype; - - if (list_empty(&ptype_all)) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) - deliver_skb(skb, ptype, skb->dev); - } - rcu_read_unlock(); -} - /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for @@ -2925,9 +2898,6 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); - if (vlan_tx_tag_present(skb)) - vlan_hwaccel_do_receive(skb); - /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) return NET_RX_DROP; @@ -2940,8 +2910,7 @@ static int __netif_receive_skb(struct sk_buff *skb) * be delivered to pkt handlers that are exact matches. Also * the deliver_no_wcard flag will be set. If packet handlers * are sensitive to duplicate packets these skbs will need to - * be dropped at the handler. The vlan accel path may have - * already set the deliver_no_wcard flag. + * be dropped at the handler. */ null_or_orig = NULL; orig_dev = skb->dev; @@ -3000,6 +2969,18 @@ ncls: goto out; } + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_hwaccel_do_receive(&skb)) { + ret = __netif_receive_skb(skb); + goto out; + } else if (unlikely(!skb)) + goto out; + } + /* * Make sure frames received on VLAN interfaces stacked on * bonding interfaces still make their way to any base bonding @@ -3264,6 +3245,7 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= compare_ether_header(skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->same_flow = !diffs; @@ -3323,6 +3305,7 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; napi->skb = skb; } -- cgit v1.1 From d5dbda23804156ae6f35025ade5307a49d1db6d7 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:07 +0000 Subject: ethtool: Add support for vlan acceleration.
Now that vlan acceleration is handled consistently regardless of usage, it is possible to enable and disable it at will. This adds support for Ethtool operations that change the offloading status for debugging purposes, similar to other forms of hardware acceleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 685c700..956a9f4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -132,7 +132,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo); * NETIF_F_xxx values in include/linux/netdevice.h */ static const u32 flags_dup_features = - (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH); + (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | + ETH_FLAG_RXHASH); u32 ethtool_op_get_flags(struct net_device *dev) { -- cgit v1.1 From 8d8a0b1cc2a8f9794a3f1f747089b6a93774408d Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 15 Oct 2010 05:12:01 +0000 Subject: rtnetlink: remove rtnl_kill_links The function rtnl_kill_links is defined but never used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b2a718d..8121268 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -299,14 +299,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -{ - rtnl_lock(); - __rtnl_kill_links(net, ops); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(rtnl_kill_links); - /** * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister -- cgit v1.1 From d2ed817766987fd05e69b7da65d4861b38f1aa2a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 21 Oct 2010 04:06:29 -0700 Subject: net/core: Allow tagged VLAN packets to flow through VETH devices. When there are VLANs on a VETH device, the packets being transmitted through the VETH device may be 4 bytes bigger than MTU. A check in dev_forward_skb did not take this into account and so dropped these packets. This patch is needed at least as far back as 2.6.34.7 and should be considered for -stable. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 660dd41..8e07109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1485,7 +1485,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) nf_reset(skb); if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN))) { kfree_skb(skb); return NET_RX_DROP; } -- cgit v1.1 From a5c30b349b872aa2ac13babbd5ceb26737f17e95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 06:04:42 +0000 Subject: net/neighbour: cancel_delayed_work() + flush_scheduled_work() -> cancel_delayed_work_sync() flush_scheduled_work() is going away. Prepare for it. Signed-off-by: Tejun Heo Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b165b96..8cc8f9a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1588,8 +1588,7 @@ int neigh_table_clear(struct neigh_table *tbl) struct neigh_table **tp; /* It is not clean... Fix it to unload IPv6 module safely */ - cancel_delayed_work(&tbl->gc_work); - flush_scheduled_work(); + cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); neigh_ifdown(tbl, NULL); -- cgit v1.1 From d0c2b0d265a0f1f92922a99a31def9da582197ac Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 19 Oct 2010 07:12:10 +0000 Subject: napi: unexport napi_reuse_skb The function napi_reuse_skb is only used inside core. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 97fd6bc..b2269ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3301,7 +3301,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_gro_receive); -void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); @@ -3309,7 +3309,6 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) napi->skb = skb; } -EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { -- cgit v1.1
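A closing note on the workqueue change above: cancel_delayed_work() only removes a not-yet-running item, and the old flush_scheduled_work() then waited on the entire global workqueue to catch an in-flight execution; cancel_delayed_work_sync() waits for just the one item, even if it rearms itself. A sketch of the pattern, assuming a self-rearming delayed work like the neighbour gc_work:

	#include <linux/workqueue.h>

	/* Hypothetical periodic job that rearms itself, as gc_work does. */
	static void example_periodic(struct work_struct *work)
	{
		struct delayed_work *dw = to_delayed_work(work);

		/* ... periodic housekeeping ... */
		schedule_delayed_work(dw, HZ);
	}

	/* Teardown: cancel and wait for any in-flight run, without
	 * flushing unrelated work as flush_scheduled_work() did.
	 */
	static void example_teardown(struct delayed_work *dw)
	{
		cancel_delayed_work_sync(dw);
	}
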