From 0a9627f2649a02bea165cfd529d7bcb625c2fcad Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 16 Mar 2010 08:03:29 +0000 Subject: rps: Receive Packet Steering This patch implements software receive side packet steering (RPS). RPS distributes the load of received packet processing across multiple CPUs. Problem statement: Protocol processing done in the NAPI context for received packets is serialized per device queue and becomes a bottleneck under high packet load. This substantially limits pps that can be achieved on a single queue NIC and provides no scaling with multiple cores. This solution queues packets early on in the receive path on the backlog queues of other CPUs. This allows protocol processing (e.g. IP and TCP) to be performed on packets in parallel. For each device (or each receive queue in a multi-queue device) a mask of CPUs is set to indicate the CPUs that can process packets. A CPU is selected on a per packet basis by hashing contents of the packet header (e.g. the TCP or UDP 4-tuple) and using the result to index into the CPU mask. The IPI mechanism is used to raise networking receive softirqs between CPUs. This effectively emulates in software what a multi-queue NIC can provide, but is generic requiring no device support. Many devices now provide a hash over the 4-tuple on a per packet basis (e.g. the Toeplitz hash). This patch allow drivers to set the HW reported hash in an skb field, and that value in turn is used to index into the RPS maps. Using the HW generated hash can avoid cache misses on the packet when steering it to a remote CPU. The CPU mask is set on a per device and per queue basis in the sysfs variable /sys/class/net//queues/rx-/rps_cpus. This is a set of canonical bit maps for receive queues in the device (numbered by ). If a device does not support multi-queue, a single variable is used for the device (rx-0). Generally, we have found this technique increases pps capabilities of a single queue device with good CPU utilization. Optimal settings for the CPU mask seem to depend on architectures and cache hierarcy. Below are some results running 500 instances of netperf TCP_RR test with 1 byte req. and resp. Results show cumulative transaction rate and system CPU utilization. e1000e on 8 core Intel Without RPS: 108K tps at 33% CPU With RPS: 311K tps at 64% CPU forcedeth on 16 core AMD Without RPS: 156K tps at 15% CPU With RPS: 404K tps at 49% CPU bnx2x on 16 core AMD Without RPS 567K tps at 61% CPU (4 HW RX queues) Without RPS 738K tps at 96% CPU (8 HW RX queues) With RPS: 854K tps at 76% CPU (4 HW RX queues) Caveats: - The benefits of this patch are dependent on architecture and cache hierarchy. Tuning the masks to get best performance is probably necessary. - This patch adds overhead in the path for processing a single packet. In a lightly loaded server this overhead may eliminate the advantages of increased parallelism, and possibly cause some relative performance degradation. We have found that masks that are cache aware (share same caches with the interrupting CPU) mitigate much of this. - The RPS masks can be changed dynamically, however whenever the mask is changed this introduces the possibility of generating out of order packets. It's probably best not change the masks too frequently. Signed-off-by: Tom Herbert include/linux/netdevice.h | 32 ++++- include/linux/skbuff.h | 3 + net/core/dev.c | 335 +++++++++++++++++++++++++++++++++++++-------- net/core/net-sysfs.c | 225 ++++++++++++++++++++++++++++++- net/core/skbuff.c | 2 + 5 files changed, 538 insertions(+), 59 deletions(-) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 335 ++++++++++++++++++++++++++++++++++++++++++--------- net/core/net-sysfs.c | 225 +++++++++++++++++++++++++++++++++- net/core/skbuff.c | 2 + 3 files changed, 505 insertions(+), 57 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bcc490c..17b1686 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1931,7 +1931,7 @@ out_kfree_skb: return rc; } -static u32 skb_tx_hashrnd; +static u32 hashrnd __read_mostly; u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { @@ -1949,7 +1949,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) else hash = skb->protocol; - hash = jhash_1word(hash, skb_tx_hashrnd); + hash = jhash_1word(hash, hashrnd); return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } @@ -1959,10 +1959,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { if (net_ratelimit()) { - WARN(1, "%s selects TX queue %d, but " + netdev_warn(dev, "selects TX queue %d, but " "real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); + queue_index, dev->real_num_tx_queues); } return 0; } @@ -2175,6 +2174,172 @@ int weight_p __read_mostly = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) +{ + struct ipv6hdr *ip6; + struct iphdr *ip; + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + int cpu = -1; + u8 ip_proto; + u32 addr1, addr2, ports, ihl; + + rcu_read_lock(); + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->num_rx_queues)) { + if (net_ratelimit()) { + netdev_warn(dev, "received packet on queue " + "%u, but number of RX queues is %u\n", + index, dev->num_rx_queues); + } + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + if (!rxqueue->rps_map) + goto done; + + if (skb->rxhash) + goto got_hash; /* Skip hash computation on packet header */ + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(*ip))) + goto done; + + ip = (struct iphdr *) skb->data; + ip_proto = ip->protocol; + addr1 = ip->saddr; + addr2 = ip->daddr; + ihl = ip->ihl; + break; + case __constant_htons(ETH_P_IPV6): + if (!pskb_may_pull(skb, sizeof(*ip6))) + goto done; + + ip6 = (struct ipv6hdr *) skb->data; + ip_proto = ip6->nexthdr; + addr1 = ip6->saddr.s6_addr32[3]; + addr2 = ip6->daddr.s6_addr32[3]; + ihl = (40 >> 2); + break; + default: + goto done; + } + ports = 0; + switch (ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_AH: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + if (pskb_may_pull(skb, (ihl * 4) + 4)) + ports = *((u32 *) (skb->data + (ihl * 4))); + break; + + default: + break; + } + + skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd); + if (!skb->rxhash) + skb->rxhash = 1; + +got_hash: + map = rcu_dereference(rxqueue->rps_map); + if (map) { + u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + + if (cpu_online(tcpu)) { + cpu = tcpu; + goto done; + } + } + +done: + rcu_read_unlock(); + return cpu; +} + +/* + * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled + * to be sent to kick remote softirq processing. There are two masks since + * the sending of IPIs must be done with interrupts enabled. The select field + * indicates the current mask that enqueue_backlog uses to schedule IPIs. + * select is flipped before net_rps_action is called while still under lock, + * net_rps_action then uses the non-selected mask to send the IPIs and clears + * it without conflicting with enqueue_backlog operation. + */ +struct rps_remote_softirq_cpus { + cpumask_t mask[2]; + int select; +}; +static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus); + +/* Called from hardirq (IPI) context */ +static void trigger_softirq(void *data) +{ + struct softnet_data *queue = data; + __napi_schedule(&queue->backlog); + __get_cpu_var(netdev_rx_stat).received_rps++; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu) +{ + struct softnet_data *queue; + unsigned long flags; + + queue = &per_cpu(softnet_data, cpu); + + local_irq_save(flags); + __get_cpu_var(netdev_rx_stat).total++; + + spin_lock(&queue->input_pkt_queue.lock); + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { + if (queue->input_pkt_queue.qlen) { +enqueue: + __skb_queue_tail(&queue->input_pkt_queue, skb); + spin_unlock_irqrestore(&queue->input_pkt_queue.lock, + flags); + return NET_RX_SUCCESS; + } + + /* Schedule NAPI for backlog device */ + if (napi_schedule_prep(&queue->backlog)) { + if (cpu != smp_processor_id()) { + struct rps_remote_softirq_cpus *rcpus = + &__get_cpu_var(rps_remote_softirq_cpus); + + cpu_set(cpu, rcpus->mask[rcpus->select]); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + } else + __napi_schedule(&queue->backlog); + } + goto enqueue; + } + + spin_unlock(&queue->input_pkt_queue.lock); + + __get_cpu_var(netdev_rx_stat).dropped++; + local_irq_restore(flags); + + kfree_skb(skb); + return NET_RX_DROP; +} /** * netif_rx - post buffer to the network code @@ -2193,8 +2358,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; int netif_rx(struct sk_buff *skb) { - struct softnet_data *queue; - unsigned long flags; + int cpu; /* if netpoll wants it, pretend we never saw it */ if (netpoll_rx(skb)) @@ -2203,31 +2367,11 @@ int netif_rx(struct sk_buff *skb) if (!skb->tstamp.tv64) net_timestamp(skb); - /* - * The code is rearranged so that the path is the most - * short when CPU is congested, but is still operating. - */ - local_irq_save(flags); - queue = &__get_cpu_var(softnet_data); - - __get_cpu_var(netdev_rx_stat).total++; - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { -enqueue: - __skb_queue_tail(&queue->input_pkt_queue, skb); - local_irq_restore(flags); - return NET_RX_SUCCESS; - } - - napi_schedule(&queue->backlog); - goto enqueue; - } - - __get_cpu_var(netdev_rx_stat).dropped++; - local_irq_restore(flags); + cpu = get_rps_cpu(skb->dev, skb); + if (cpu < 0) + cpu = smp_processor_id(); - kfree_skb(skb); - return NET_RX_DROP; + return enqueue_to_backlog(skb, cpu); } EXPORT_SYMBOL(netif_rx); @@ -2464,22 +2608,7 @@ void netif_nit_deliver(struct sk_buff *skb) rcu_read_unlock(); } -/** - * netif_receive_skb - process receive buffer from network - * @skb: buffer to process - * - * netif_receive_skb() is the main receive data processing function. - * It always succeeds. The buffer may be dropped during processing - * for congestion control or by the protocol layers. - * - * This function may only be called from softirq context and interrupts - * should be enabled. - * - * Return values (usually ignored): - * NET_RX_SUCCESS: no congestion - * NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; struct net_device *orig_dev; @@ -2588,6 +2717,33 @@ out: rcu_read_unlock(); return ret; } + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + int cpu; + + cpu = get_rps_cpu(skb->dev, skb); + + if (cpu < 0) + return __netif_receive_skb(skb); + else + return enqueue_to_backlog(skb, cpu); +} EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending */ @@ -2914,16 +3070,16 @@ static int process_backlog(struct napi_struct *napi, int quota) do { struct sk_buff *skb; - local_irq_disable(); + spin_lock_irq(&queue->input_pkt_queue.lock); skb = __skb_dequeue(&queue->input_pkt_queue); if (!skb) { __napi_complete(napi); - local_irq_enable(); + spin_unlock_irq(&queue->input_pkt_queue.lock); break; } - local_irq_enable(); + spin_unlock_irq(&queue->input_pkt_queue.lock); - netif_receive_skb(skb); + __netif_receive_skb(skb); } while (++work < quota && jiffies == start_time); return work; @@ -3012,6 +3168,22 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); +/* + * net_rps_action sends any pending IPI's for rps. This is only called from + * softirq and interrupts must be enabled. + */ +static void net_rps_action(cpumask_t *mask) +{ + int cpu; + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + for_each_cpu_mask_nr(cpu, *mask) { + struct softnet_data *queue = &per_cpu(softnet_data, cpu); + if (cpu_online(cpu)) + __smp_call_function_single(cpu, &queue->csd, 0); + } + cpus_clear(*mask); +} static void net_rx_action(struct softirq_action *h) { @@ -3019,6 +3191,8 @@ static void net_rx_action(struct softirq_action *h) unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; + int select; + struct rps_remote_softirq_cpus *rcpus; local_irq_disable(); @@ -3081,8 +3255,14 @@ static void net_rx_action(struct softirq_action *h) netpoll_poll_unlock(have); } out: + rcpus = &__get_cpu_var(rps_remote_softirq_cpus); + select = rcpus->select; + rcpus->select ^= 1; + local_irq_enable(); + net_rps_action(&rcpus->mask[select]); + #ifdef CONFIG_NET_DMA /* * There may not be any more sk_buffs coming right now, so push @@ -3327,10 +3507,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v) { struct netif_rx_stats *s = v; - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", s->total, s->dropped, s->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - s->cpu_collision); + s->cpu_collision, s->received_rps); return 0; } @@ -5067,6 +5247,23 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; + if (!dev->num_rx_queues) { + /* + * Allocate a single RX queue if driver never called + * alloc_netdev_mq + */ + + dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!dev->_rx) { + ret = -ENOMEM; + goto out; + } + + dev->_rx->first = dev->_rx; + atomic_set(&dev->_rx->count, 1); + dev->num_rx_queues = 1; + } + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5424,9 +5621,11 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { struct netdev_queue *tx; + struct netdev_rx_queue *rx; struct net_device *dev; size_t alloc_size; struct net_device *p; + int i; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5452,11 +5651,27 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } + rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + printk(KERN_ERR "alloc_netdev: Unable to allocate " + "rx queues.\n"); + goto free_tx; + } + + atomic_set(&rx->count, queue_count); + + /* + * Set a pointer to first element in the array which holds the + * reference count. + */ + for (i = 0; i < queue_count; i++) + rx[i].first = rx; + dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) - goto free_tx; + goto free_rx; dev_unicast_init(dev); @@ -5466,6 +5681,9 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; + dev->_rx = rx; + dev->num_rx_queues = queue_count; + dev->gso_max_size = GSO_MAX_SIZE; netdev_init_queues(dev); @@ -5480,9 +5698,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; +free_rx: + kfree(rx); free_tx: kfree(tx); - free_p: kfree(p); return NULL; @@ -5985,6 +6204,10 @@ static int __init net_dev_init(void) queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); + queue->csd.func = trigger_softirq; + queue->csd.info = queue; + queue->csd.flags = 0; + queue->backlog.poll = process_backlog; queue->backlog.weight = weight_p; queue->backlog.gro_list = NULL; @@ -6023,7 +6246,7 @@ subsys_initcall(net_dev_init); static int __init initialize_hashrnd(void) { - get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); + get_random_bytes(&hashrnd, sizeof(hashrnd)); return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 099c753..7a46343 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -466,6 +466,216 @@ static struct attribute_group wireless_group = { }; #endif +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_rx_queue_attr(_attr) container_of(_attr, \ + struct rx_queue_attribute, attr) + +#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) + +static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static struct sysfs_ops rx_queue_sysfs_ops = { + .show = rx_queue_attr_show, + .store = rx_queue_attr_store, +}; + +static ssize_t show_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + struct rps_map *map; + cpumask_var_t mask; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map) + for (i = 0; i < map->len; i++) + cpumask_set_cpu(map->cpus[i], mask); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + rcu_read_unlock(); + free_cpumask_var(mask); + return -EINVAL; + } + rcu_read_unlock(); + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static void rps_map_release(struct rcu_head *rcu) +{ + struct rps_map *map = container_of(rcu, struct rps_map, rcu); + + kfree(map); +} + +ssize_t store_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + cpumask_var_t mask; + int err, cpu, i; + static DEFINE_SPINLOCK(rps_map_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + map = kzalloc(max_t(unsigned, + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) { + free_cpumask_var(mask); + return -ENOMEM; + } + + i = 0; + for_each_cpu_and(cpu, mask, cpu_online_mask) + map->cpus[i++] = cpu; + + if (i) + map->len = i; + else { + kfree(map); + map = NULL; + } + + spin_lock(&rps_map_lock); + old_map = queue->rps_map; + rcu_assign_pointer(queue->rps_map, map); + spin_unlock(&rps_map_lock); + + if (old_map) + call_rcu(&old_map->rcu, rps_map_release); + + free_cpumask_var(mask); + return len; +} + +static struct rx_queue_attribute rps_cpus_attribute = + __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + +static struct attribute *rx_queue_default_attrs[] = { + &rps_cpus_attribute.attr, + NULL +}; + +static void rx_queue_release(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct rps_map *map = queue->rps_map; + struct netdev_rx_queue *first = queue->first; + + if (map) + call_rcu(&map->rcu, rps_map_release); + + if (atomic_dec_and_test(&first->count)) + kfree(first); +} + +static struct kobj_type rx_queue_ktype = { + .sysfs_ops = &rx_queue_sysfs_ops, + .release = rx_queue_release, + .default_attrs = rx_queue_default_attrs, +}; + +static int rx_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_rx_queue *queue = net->_rx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, + "rx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + + return error; +} + +static int rx_queue_register_kobjects(struct net_device *net) +{ + int i; + int error = 0; + + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; + for (i = 0; i < net->num_rx_queues; i++) { + error = rx_queue_add_kobject(net, i); + if (error) + break; + } + + if (error) + while (--i >= 0) + kobject_put(&net->_rx[i].kobj); + + return error; +} + +static void rx_queue_remove_kobjects(struct net_device *net) +{ + int i; + + for (i = 0; i < net->num_rx_queues; i++) + kobject_put(&net->_rx[i].kobj); + kset_unregister(net->queues_kset); +} + #endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG @@ -529,6 +739,8 @@ void netdev_unregister_kobject(struct net_device * net) if (!net_eq(dev_net(net), &init_net)) return; + rx_queue_remove_kobjects(net); + device_del(dev); } @@ -537,6 +749,7 @@ int netdev_register_kobject(struct net_device *net) { struct device *dev = &(net->dev); const struct attribute_group **groups = net->sysfs_groups; + int error = 0; dev->class = &net_class; dev->platform_data = net; @@ -563,7 +776,17 @@ int netdev_register_kobject(struct net_device *net) if (!net_eq(dev_net(net), &init_net)) return 0; - return device_add(dev); + error = device_add(dev); + if (error) + return error; + + error = rx_queue_register_kobjects(net); + if (error) { + device_del(dev); + return error; + } + + return error; } int netdev_class_create_file(struct class_attribute *class_attr) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 93c4e06..bdea0ef 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -534,6 +534,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->network_header = old->network_header; new->mac_header = old->mac_header; skb_dst_set(new, dst_clone(skb_dst(old))); + new->rxhash = old->rxhash; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -581,6 +582,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(len); C(data_len); C(mac_len); + C(rxhash); n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; -- cgit v1.1 From 2fb3573dfbca0bd853ddc1e47617eb446fa3deae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Mar 2010 20:03:38 +0000 Subject: net: remove rcu locking from fib_rules_event() We hold RTNL at this point and dont use RCU variants of list traversals, we dont need rcu_read_lock()/rcu_read_unlock() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a24377..2ff3489 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -108,7 +108,7 @@ fib_rules_register(struct fib_rules_ops *tmpl, struct net *net) struct fib_rules_ops *ops; int err; - ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL); + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); @@ -123,7 +123,6 @@ fib_rules_register(struct fib_rules_ops *tmpl, struct net *net) return ops; } - EXPORT_SYMBOL_GPL(fib_rules_register); void fib_rules_cleanup_ops(struct fib_rules_ops *ops) @@ -157,7 +156,6 @@ void fib_rules_unregister(struct fib_rules_ops *ops) call_rcu(&ops->rcu, fib_rules_put_rcu); } - EXPORT_SYMBOL_GPL(fib_rules_unregister); static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, @@ -220,7 +218,6 @@ out: return err; } - EXPORT_SYMBOL_GPL(fib_rules_lookup); static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, @@ -613,7 +610,7 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) break; cb->args[1] = 0; - skip: +skip: idx++; } rcu_read_unlock(); @@ -685,7 +682,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct fib_rules_ops *ops; ASSERT_RTNL(); - rcu_read_lock(); switch (event) { case NETDEV_REGISTER: @@ -699,8 +695,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, break; } - rcu_read_unlock(); - return NOTIFY_DONE; } -- cgit v1.1 From 10708f37ae729baba9b67bd134c3720709d4ae62 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 11 Mar 2010 09:57:29 +0000 Subject: net: core: add IFLA_STATS64 support `ip -s link` shows interface counters truncated to 32 bit. This is because interface statistics are transported only in 32-bit quantity to userspace. This commit adds a new IFLA_STATS64 attribute that exports them in full 64 bit. References: http://lkml.indiana.edu/hypermail/linux/kernel/0307.3/0215.html Signed-off-by: Jan Engelhardt Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4568120..e1121f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -600,7 +600,39 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; -}; +} + +static void copy_rtnl_link_stats64(struct rtnl_link_stats64 *a, + const struct net_device_stats *b) +{ + a->rx_packets = b->rx_packets; + a->tx_packets = b->tx_packets; + a->rx_bytes = b->rx_bytes; + a->tx_bytes = b->tx_bytes; + a->rx_errors = b->rx_errors; + a->tx_errors = b->tx_errors; + a->rx_dropped = b->rx_dropped; + a->tx_dropped = b->tx_dropped; + + a->multicast = b->multicast; + a->collisions = b->collisions; + + a->rx_length_errors = b->rx_length_errors; + a->rx_over_errors = b->rx_over_errors; + a->rx_crc_errors = b->rx_crc_errors; + a->rx_frame_errors = b->rx_frame_errors; + a->rx_fifo_errors = b->rx_fifo_errors; + a->rx_missed_errors = b->rx_missed_errors; + + a->tx_aborted_errors = b->tx_aborted_errors; + a->tx_carrier_errors = b->tx_carrier_errors; + a->tx_fifo_errors = b->tx_fifo_errors; + a->tx_heartbeat_errors = b->tx_heartbeat_errors; + a->tx_window_errors = b->tx_window_errors; + + a->rx_compressed = b->rx_compressed; + a->tx_compressed = b->tx_compressed; +} static inline int rtnl_vfinfo_size(const struct net_device *dev) { @@ -698,6 +730,14 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, stats = dev_get_stats(dev); copy_rtnl_link_stats(nla_data(attr), stats); + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (attr == NULL) + goto nla_put_failure; + + stats = dev_get_stats(dev); + copy_rtnl_link_stats64(nla_data(attr), stats); + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { int i; struct ifla_vf_info ivi; -- cgit v1.1 From 1e94d72feab025b8f7c55d07020602f82f3a97dd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 18 Mar 2010 17:45:44 -0700 Subject: rps: Fixed build with CONFIG_SMP not enabled. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 17b1686..1a7e1d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2174,6 +2174,7 @@ int weight_p __read_mostly = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; +#ifdef CONFIG_SMP /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. @@ -2293,6 +2294,7 @@ static void trigger_softirq(void *data) __napi_schedule(&queue->backlog); __get_cpu_var(netdev_rx_stat).received_rps++; } +#endif /* CONFIG_SMP */ /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog @@ -2320,6 +2322,7 @@ enqueue: /* Schedule NAPI for backlog device */ if (napi_schedule_prep(&queue->backlog)) { +#ifdef CONFIG_SMP if (cpu != smp_processor_id()) { struct rps_remote_softirq_cpus *rcpus = &__get_cpu_var(rps_remote_softirq_cpus); @@ -2328,6 +2331,9 @@ enqueue: __raise_softirq_irqoff(NET_RX_SOFTIRQ); } else __napi_schedule(&queue->backlog); +#else + __napi_schedule(&queue->backlog); +#endif } goto enqueue; } @@ -2367,9 +2373,13 @@ int netif_rx(struct sk_buff *skb) if (!skb->tstamp.tv64) net_timestamp(skb); +#ifdef CONFIG_SMP cpu = get_rps_cpu(skb->dev, skb); if (cpu < 0) cpu = smp_processor_id(); +#else + cpu = smp_processor_id(); +#endif return enqueue_to_backlog(skb, cpu); } @@ -2735,6 +2745,7 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { +#ifdef CONFIG_SMP int cpu; cpu = get_rps_cpu(skb->dev, skb); @@ -2743,6 +2754,9 @@ int netif_receive_skb(struct sk_buff *skb) return __netif_receive_skb(skb); else return enqueue_to_backlog(skb, cpu); +#else + return __netif_receive_skb(skb); +#endif } EXPORT_SYMBOL(netif_receive_skb); @@ -3168,6 +3182,7 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); +#ifdef CONFIG_SMP /* * net_rps_action sends any pending IPI's for rps. This is only called from * softirq and interrupts must be enabled. @@ -3184,6 +3199,7 @@ static void net_rps_action(cpumask_t *mask) } cpus_clear(*mask); } +#endif static void net_rx_action(struct softirq_action *h) { @@ -3191,8 +3207,10 @@ static void net_rx_action(struct softirq_action *h) unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; +#ifdef CONFIG_SMP int select; struct rps_remote_softirq_cpus *rcpus; +#endif local_irq_disable(); @@ -3255,6 +3273,7 @@ static void net_rx_action(struct softirq_action *h) netpoll_poll_unlock(have); } out: +#ifdef CONFIG_SMP rcpus = &__get_cpu_var(rps_remote_softirq_cpus); select = rcpus->select; rcpus->select ^= 1; @@ -3262,6 +3281,9 @@ out: local_irq_enable(); net_rps_action(&rcpus->mask[select]); +#else + local_irq_enable(); +#endif #ifdef CONFIG_NET_DMA /* @@ -6204,9 +6226,11 @@ static int __init net_dev_init(void) queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); +#ifdef CONFIG_SMP queue->csd.func = trigger_softirq; queue->csd.info = queue; queue->csd.flags = 0; +#endif queue->backlog.poll = process_backlog; queue->backlog.weight = weight_p; -- cgit v1.1 From 3ca5b4042ecae5e73c59de62e4ac0db31c10e0f8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 10 Mar 2010 10:29:35 +0000 Subject: bonding: check return value of nofitier when changing type This patch adds the possibility to refuse the bonding type change for other subsystems (such as for example bridge, vlan, etc.) Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1a7e1d1..d1f027c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1084,9 +1084,9 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); -void netdev_bonding_change(struct net_device *dev, unsigned long event) +int netdev_bonding_change(struct net_device *dev, unsigned long event) { - call_netdevice_notifiers(event, dev); + return call_netdevice_notifiers(event, dev); } EXPORT_SYMBOL(netdev_bonding_change); -- cgit v1.1 From 755d0e77ac9c8d125388922dc33434ed5b2ebe80 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 19 Mar 2010 04:42:24 +0000 Subject: net: rtnetlink: ignore NETDEV_PRE_TYPE_CHANGE in rtnetlink_event() Ignore the new NETDEV_PRE_TYPE_CHANGE event in rtnetlink_event() since there have been no changes userspace needs to be notified of. Also add a comment to the netdev notifier event definitions to remind people to update the exclusion list when adding new event types. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e1121f0..ffc6cf3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1513,6 +1513,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_POST_INIT: case NETDEV_REGISTER: case NETDEV_CHANGE: + case NETDEV_PRE_TYPE_CHANGE: case NETDEV_GOING_DOWN: case NETDEV_UNREGISTER: case NETDEV_UNREGISTER_BATCH: -- cgit v1.1 From 32a806c194ea112cfab00f558482dd97bee5e44e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 19 Mar 2010 04:00:23 +0000 Subject: bonding: flush unicast and multicast lists when changing type After the type change, addresses in unicast and multicast lists wouldn't make sense, not to mention possible different lenghts. So flush both lists here. Note "dev_addr_discard" will be very soon replaced by "dev_mc_flush" (once mc_list conversion will be done). Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c0e2608..fe2a754 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4457,12 +4457,13 @@ void dev_unicast_unsync(struct net_device *to, struct net_device *from) } EXPORT_SYMBOL(dev_unicast_unsync); -static void dev_unicast_flush(struct net_device *dev) +void dev_unicast_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->uc); netif_addr_unlock_bh(dev); } +EXPORT_SYMBOL(dev_unicast_flush); static void dev_unicast_init(struct net_device *dev) { @@ -4484,7 +4485,7 @@ static void __dev_addr_discard(struct dev_addr_list **list) } } -static void dev_addr_discard(struct net_device *dev) +void dev_addr_discard(struct net_device *dev) { netif_addr_lock_bh(dev); @@ -4493,6 +4494,7 @@ static void dev_addr_discard(struct net_device *dev) netif_addr_unlock_bh(dev); } +EXPORT_SYMBOL(dev_addr_discard); /** * dev_get_flags - get flags reported to userspace -- cgit v1.1 From 283f2fe87e980d8af5ad8aa63751e7e3258ee05a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Mar 2010 13:37:40 +0000 Subject: net: speedup netdev_set_master() We currently force a synchronize_net() in netdev_set_master() This seems necessary only when a slave had a master and we dismantle it. In the other case ("ifenslave bond0 ethO"), we dont need this long delay. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fe2a754..2d01f18 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3757,11 +3757,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) slave->master = master; - synchronize_net(); - - if (old) + if (old) { + synchronize_net(); dev_put(old); - + } if (master) slave->flags |= IFF_SLAVE; else -- cgit v1.1 From 99fe3c391d50d381687fd84ed0ab22d57079e41f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Mar 2010 11:27:25 +0000 Subject: net: dev_getfirstbyhwtype() optimization Use RCU to avoid RTNL use in dev_getfirstbyhwtype() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2d01f18..a03aab4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -772,14 +772,17 @@ EXPORT_SYMBOL(__dev_getfirstbyhwtype); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { - struct net_device *dev; + struct net_device *dev, *ret = NULL; - rtnl_lock(); - dev = __dev_getfirstbyhwtype(net, type); - if (dev) - dev_hold(dev); - rtnl_unlock(); - return dev; + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + if (dev->type == type) { + dev_hold(dev); + ret = dev; + break; + } + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); -- cgit v1.1 From e99b99b471c21b071132e51bb7aa6b7a8796dc02 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 18 Mar 2010 22:44:30 +0000 Subject: pktgen node allocation Here is patch to manipulate packet node allocation and implicitly how packets are DMA'd etc. The flag NODE_ALLOC enables the function and numa_node_id(); when enabled it can also be explicitly controlled via a new node parameter Tested this with 10 Intel 82599 ports w. TYAN S7025 E5520 CPU's. Was able to TX/DMA ~80 Gbit/s to Ethernet wires. Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- net/core/pktgen.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 4392381..2ad68da 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -169,7 +169,7 @@ #include #include /* do_div */ -#define VERSION "2.72" +#define VERSION "2.73" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) @@ -190,6 +190,7 @@ #define F_IPSEC_ON (1<<12) /* ipsec on for flows */ #define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ +#define F_NODE (1<<15) /* Node memory alloc*/ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -372,6 +373,7 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; + int node; /* Memory node */ #ifdef CONFIG_XFRM __u8 ipsmode; /* IPSEC mode (config) */ @@ -607,6 +609,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->traffic_class) seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + if (pkt_dev->node >= 0) + seq_printf(seq, " node: %d\n", pkt_dev->node); + seq_printf(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) @@ -660,6 +665,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_SVID_RND) seq_printf(seq, "SVID_RND "); + if (pkt_dev->flags & F_NODE) + seq_printf(seq, "NODE_ALLOC "); + seq_puts(seq, "\n"); /* not really stopped, more like last-running-at */ @@ -1074,6 +1082,21 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->dst_mac_count); return count; } + if (!strcmp(name, "node")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + + if (node_possible(value)) { + pkt_dev->node = value; + sprintf(pg_result, "OK: node=%d", pkt_dev->node); + } + else + sprintf(pg_result, "ERROR: node not possible"); + return count; + } if (!strcmp(name, "flag")) { char f[32]; memset(f, 0, 32); @@ -1166,12 +1189,18 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!IPV6") == 0) pkt_dev->flags &= ~F_IPV6; + else if (strcmp(f, "NODE_ALLOC") == 0) + pkt_dev->flags |= F_NODE; + + else if (strcmp(f, "!NODE_ALLOC") == 0) + pkt_dev->flags &= ~F_NODE; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", f, "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " - "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n"); + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); return count; } sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); @@ -2572,9 +2601,27 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); datalen = (odev->hard_header_len + 16) & ~0xf; - skb = __netdev_alloc_skb(odev, - pkt_dev->cur_pkt_size + 64 - + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); + + if (pkt_dev->flags & F_NODE) { + int node; + + if (pkt_dev->node >= 0) + node = pkt_dev->node; + else + node = numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = odev; + } + } + else + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); + if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; @@ -3674,6 +3721,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; + pkt_dev->node = -1; err = pktgen_setup_dev(pkt_dev, ifname); if (err) -- cgit v1.1 From e880eb6c5c9d98e389ffc0d8947f75d70785361a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 22 Mar 2010 18:06:47 -0700 Subject: rps: Fix build with CONFIG_SYSFS enabled Fix build with CONFIG_SYSFS not enabled. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7a46343..f6b6bfe 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -739,7 +739,9 @@ void netdev_unregister_kobject(struct net_device * net) if (!net_eq(dev_net(net), &init_net)) return; +#ifdef CONFIG_SYSFS rx_queue_remove_kobjects(net); +#endif device_del(dev); } @@ -780,11 +782,13 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; +#ifdef CONFIG_SYSFS error = rx_queue_register_kobjects(net); if (error) { device_del(dev); return error; } +#endif return error; } -- cgit v1.1 From e51d739ab79110c43ca03daf3ddb3c52dadd38b7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 23 Mar 2010 13:39:19 +0000 Subject: net: Fix locking in flush_backlog Need to take spinlocks when dequeuing from input_pkt_queue in flush_backlog. Also, flush_backlog can now be called directly from netdev_run_todo. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a03aab4..5e3dc28 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2766,17 +2766,19 @@ int netif_receive_skb(struct sk_buff *skb) EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending */ -static void flush_backlog(void *arg) +static void flush_backlog(struct net_device *dev, int cpu) { - struct net_device *dev = arg; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue = &per_cpu(softnet_data, cpu); struct sk_buff *skb, *tmp; + unsigned long flags; + spin_lock_irqsave(&queue->input_pkt_queue.lock, flags); skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) if (skb->dev == dev) { __skb_unlink(skb, &queue->input_pkt_queue); kfree_skb(skb); } + spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags); } static int napi_gro_complete(struct sk_buff *skb) @@ -5545,6 +5547,7 @@ void netdev_run_todo(void) while (!list_empty(&list)) { struct net_device *dev = list_first_entry(&list, struct net_device, todo_list); + int i; list_del(&dev->todo_list); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { @@ -5556,7 +5559,8 @@ void netdev_run_todo(void) dev->reg_state = NETREG_UNREGISTERED; - on_each_cpu(flush_backlog, dev, 1); + for_each_online_cpu(i) + flush_backlog(dev, i); netdev_wait_allrefs(dev); -- cgit v1.1 From df3345457a7a174dfb5872a070af80d456985038 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Mar 2010 19:13:54 +0000 Subject: rps: add CONFIG_RPS RPS currently depends on SMP and SYSFS Adding a CONFIG_RPS makes sense in case this requirement changes in the future. This patch saves about 1500 bytes of kernel text in case SMP is on but SYSFS is off. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5e3dc28..bcb3ed2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,7 +2177,7 @@ int weight_p __read_mostly = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. @@ -2325,7 +2325,7 @@ enqueue: /* Schedule NAPI for backlog device */ if (napi_schedule_prep(&queue->backlog)) { -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS if (cpu != smp_processor_id()) { struct rps_remote_softirq_cpus *rcpus = &__get_cpu_var(rps_remote_softirq_cpus); @@ -2376,7 +2376,7 @@ int netif_rx(struct sk_buff *skb) if (!skb->tstamp.tv64) net_timestamp(skb); -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS cpu = get_rps_cpu(skb->dev, skb); if (cpu < 0) cpu = smp_processor_id(); @@ -2750,7 +2750,7 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS int cpu; cpu = get_rps_cpu(skb->dev, skb); @@ -3189,7 +3189,7 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS /* * net_rps_action sends any pending IPI's for rps. This is only called from * softirq and interrupts must be enabled. @@ -3214,7 +3214,7 @@ static void net_rx_action(struct softirq_action *h) unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS int select; struct rps_remote_softirq_cpus *rcpus; #endif @@ -3280,7 +3280,7 @@ static void net_rx_action(struct softirq_action *h) netpoll_poll_unlock(have); } out: -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS rcpus = &__get_cpu_var(rps_remote_softirq_cpus); select = rcpus->select; rcpus->select ^= 1; @@ -5277,6 +5277,7 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; +#ifdef CONFIG_RPS if (!dev->num_rx_queues) { /* * Allocate a single RX queue if driver never called @@ -5293,7 +5294,7 @@ int register_netdevice(struct net_device *dev) atomic_set(&dev->_rx->count, 1); dev->num_rx_queues = 1; } - +#endif /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5653,11 +5654,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { struct netdev_queue *tx; - struct netdev_rx_queue *rx; struct net_device *dev; size_t alloc_size; struct net_device *p; +#ifdef CONFIG_RPS + struct netdev_rx_queue *rx; int i; +#endif BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5683,6 +5686,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } +#ifdef CONFIG_RPS rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); if (!rx) { printk(KERN_ERR "alloc_netdev: Unable to allocate " @@ -5698,6 +5702,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, */ for (i = 0; i < queue_count; i++) rx[i].first = rx; +#endif dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; @@ -5713,8 +5718,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; +#ifdef CONFIG_RPS dev->_rx = rx; dev->num_rx_queues = queue_count; +#endif dev->gso_max_size = GSO_MAX_SIZE; @@ -5731,8 +5738,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return dev; free_rx: +#ifdef CONFIG_RPS kfree(rx); free_tx: +#endif kfree(tx); free_p: kfree(p); @@ -6236,7 +6245,7 @@ static int __init net_dev_init(void) queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); -#ifdef CONFIG_SMP +#ifdef CONFIG_RPS queue->csd.func = trigger_softirq; queue->csd.info = queue; queue->csd.flags = 0; -- cgit v1.1 From 14a4b42bd6082b4ce3b94bad00cd367707cc1e97 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sat, 27 Mar 2010 16:35:50 -0700 Subject: net: fix unaligned access in IFLA_STATS64 Tony Luck observes that the original IFLA_STATS64 submission causes unaligned accesses. This is because nla_data() returns a pointer to a memory region that is only aligned to 32 bits. Do some memcpying to workaround this. Signed-off-by: Jan Engelhardt Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 62 ++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ffc6cf3..ed0766f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -602,36 +602,38 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->tx_compressed = b->tx_compressed; } -static void copy_rtnl_link_stats64(struct rtnl_link_stats64 *a, - const struct net_device_stats *b) +static void copy_rtnl_link_stats64(void *v, const struct net_device_stats *b) { - a->rx_packets = b->rx_packets; - a->tx_packets = b->tx_packets; - a->rx_bytes = b->rx_bytes; - a->tx_bytes = b->tx_bytes; - a->rx_errors = b->rx_errors; - a->tx_errors = b->tx_errors; - a->rx_dropped = b->rx_dropped; - a->tx_dropped = b->tx_dropped; - - a->multicast = b->multicast; - a->collisions = b->collisions; - - a->rx_length_errors = b->rx_length_errors; - a->rx_over_errors = b->rx_over_errors; - a->rx_crc_errors = b->rx_crc_errors; - a->rx_frame_errors = b->rx_frame_errors; - a->rx_fifo_errors = b->rx_fifo_errors; - a->rx_missed_errors = b->rx_missed_errors; - - a->tx_aborted_errors = b->tx_aborted_errors; - a->tx_carrier_errors = b->tx_carrier_errors; - a->tx_fifo_errors = b->tx_fifo_errors; - a->tx_heartbeat_errors = b->tx_heartbeat_errors; - a->tx_window_errors = b->tx_window_errors; - - a->rx_compressed = b->rx_compressed; - a->tx_compressed = b->tx_compressed; + struct rtnl_link_stats64 a; + + a.rx_packets = b->rx_packets; + a.tx_packets = b->tx_packets; + a.rx_bytes = b->rx_bytes; + a.tx_bytes = b->tx_bytes; + a.rx_errors = b->rx_errors; + a.tx_errors = b->tx_errors; + a.rx_dropped = b->rx_dropped; + a.tx_dropped = b->tx_dropped; + + a.multicast = b->multicast; + a.collisions = b->collisions; + + a.rx_length_errors = b->rx_length_errors; + a.rx_over_errors = b->rx_over_errors; + a.rx_crc_errors = b->rx_crc_errors; + a.rx_frame_errors = b->rx_frame_errors; + a.rx_fifo_errors = b->rx_fifo_errors; + a.rx_missed_errors = b->rx_missed_errors; + + a.tx_aborted_errors = b->tx_aborted_errors; + a.tx_carrier_errors = b->tx_carrier_errors; + a.tx_fifo_errors = b->tx_fifo_errors; + a.tx_heartbeat_errors = b->tx_heartbeat_errors; + a.tx_window_errors = b->tx_window_errors; + + a.rx_compressed = b->rx_compressed; + a.tx_compressed = b->tx_compressed; + memcpy(v, &a, sizeof(a)); } static inline int rtnl_vfinfo_size(const struct net_device *dev) @@ -734,8 +736,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, sizeof(struct rtnl_link_stats64)); if (attr == NULL) goto nla_put_failure; - - stats = dev_get_stats(dev); copy_rtnl_link_stats64(nla_data(attr), stats); if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { -- cgit v1.1 From adcfe1964e627b62fbc6e45609b1f0db2c64dd14 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sat, 27 Mar 2010 17:15:29 -0700 Subject: net: increase preallocated size of nlmsg to accomodate for IFLA_STATS64 When more data is stuffed into an nlmsg than initially projected, an extra allocation needs to be done. Reserve enough for IFLA_STATS64 so that this does not to needlessy happen. Signed-off-by: Jan Engelhardt Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ed0766f..bf919b6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -653,6 +653,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ -- cgit v1.1 From 10f744d205dde72a0016dbdb11e239da8269958b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Mar 2010 23:07:20 -0700 Subject: net: __netif_receive_skb should be static Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bcb3ed2..887aa84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2621,7 +2621,7 @@ void netif_nit_deliver(struct sk_buff *skb) rcu_read_unlock(); } -int __netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; struct net_device *orig_dev; -- cgit v1.1 From 30bde1f5076a9b6bd4b6a168523930ce242c7449 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Mar 2010 01:00:44 -0700 Subject: rps: fix net-sysfs build for !CONFIG_RPS Signed-off-by: Stephen Rothwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f6b6bfe..1e7fdd6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -466,6 +466,7 @@ static struct attribute_group wireless_group = { }; #endif +#ifdef CONFIG_RPS /* * RX queue sysfs structures and functions. */ @@ -675,7 +676,7 @@ static void rx_queue_remove_kobjects(struct net_device *net) kobject_put(&net->_rx[i].kobj); kset_unregister(net->queues_kset); } - +#endif /* CONFIG_RPS */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG @@ -739,7 +740,7 @@ void netdev_unregister_kobject(struct net_device * net) if (!net_eq(dev_net(net), &init_net)) return; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_RPS rx_queue_remove_kobjects(net); #endif @@ -782,7 +783,7 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_RPS error = rx_queue_register_kobjects(net); if (error) { device_del(dev); -- cgit v1.1 From b00fabb4020d17bda4bea59507e09fadf573088d Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 29 Mar 2010 14:47:27 +0000 Subject: netdev: ethtool RXHASH flag This adds ethtool and device feature flag to allow control of receive hashing offload. Signed-off-by: Stephen Hemminger Acked-by: Jeff Garzik Signed-off-by: David S. Miller --- net/core/ethtool.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4cb6b6..73c81ed 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -121,7 +121,7 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data) * NETIF_F_xxx values in include/linux/netdevice.h */ static const u32 flags_dup_features = - (ETH_FLAG_LRO | ETH_FLAG_NTUPLE); + (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH); u32 ethtool_op_get_flags(struct net_device *dev) { @@ -152,6 +152,11 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data) features &= ~NETIF_F_NTUPLE; } + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; + else + features &= ~NETIF_F_RXHASH; + dev->features = features; return 0; } -- cgit v1.1 From 598ed9367a36ee1fd4ae3271a54a3547a33975a5 Mon Sep 17 00:00:00 2001 From: laurent chavey Date: Mon, 29 Mar 2010 10:41:36 +0000 Subject: fix net/core/dst.c coding style error and warnings Fix coding style errors and warnings output while running checkpatch.pl on the file net/core/dst.c. Signed-off-by: chavey Signed-off-by: David S. Miller --- net/core/dst.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index cb1b348..2076d84 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -43,7 +43,7 @@ static atomic_t dst_total = ATOMIC_INIT(0); */ static struct { spinlock_t lock; - struct dst_entry *list; + struct dst_entry *list; unsigned long timer_inc; unsigned long timer_expires; } dst_garbage = { @@ -51,7 +51,7 @@ static struct { .timer_inc = DST_GC_MAX, }; static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry * dst); +static void ___dst_free(struct dst_entry *dst); static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); @@ -135,8 +135,8 @@ loop: } expires = dst_garbage.timer_expires; /* - * if the next desired timer is more than 4 seconds in the future - * then round the timer to whole seconds + * if the next desired timer is more than 4 seconds in the + * future then round the timer to whole seconds */ if (expires > 4*HZ) expires = round_jiffies_relative(expires); @@ -151,7 +151,8 @@ loop: " expires: %lu elapsed: %lu us\n", atomic_read(&dst_total), delayed, work_performed, expires, - elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC); + elapsed.tv_sec * USEC_PER_SEC + + elapsed.tv_nsec / NSEC_PER_USEC); #endif } @@ -162,9 +163,9 @@ int dst_discard(struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard); -void * dst_alloc(struct dst_ops * ops) +void *dst_alloc(struct dst_ops *ops) { - struct dst_entry * dst; + struct dst_entry *dst; if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { if (ops->gc(ops)) @@ -184,19 +185,20 @@ void * dst_alloc(struct dst_ops * ops) atomic_inc(&ops->entries); return dst; } +EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry * dst) +static void ___dst_free(struct dst_entry *dst) { /* The first case (dev==NULL) is required, when protocol module is unloaded. */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) dst->input = dst->output = dst_discard; - } dst->obsolete = 2; } +EXPORT_SYMBOL(__dst_free); -void __dst_free(struct dst_entry * dst) +void __dst_free(struct dst_entry *dst) { spin_lock_bh(&dst_garbage.lock); ___dst_free(dst); @@ -261,15 +263,16 @@ again: } return NULL; } +EXPORT_SYMBOL(dst_destroy); void dst_release(struct dst_entry *dst) { if (dst) { - int newrefcnt; + int newrefcnt; smp_mb__before_atomic_dec(); - newrefcnt = atomic_dec_return(&dst->__refcnt); - WARN_ON(newrefcnt < 0); + newrefcnt = atomic_dec_return(&dst->__refcnt); + WARN_ON(newrefcnt < 0); } } EXPORT_SYMBOL(dst_release); @@ -305,7 +308,8 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, } } -static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +static int dst_dev_event(struct notifier_block *this, unsigned long event, + void *ptr) { struct net_device *dev = ptr; struct dst_entry *dst, *last = NULL; @@ -328,9 +332,8 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void last->next = dst; else dst_busy_list = dst; - for (; dst; dst = dst->next) { + for (; dst; dst = dst->next) dst_ifdown(dst, dev, event != NETDEV_DOWN); - } mutex_unlock(&dst_gc_mutex); break; } @@ -345,7 +348,3 @@ void __init dst_init(void) { register_netdevice_notifier(&dst_dev_notifier); } - -EXPORT_SYMBOL(__dst_free); -EXPORT_SYMBOL(dst_alloc); -EXPORT_SYMBOL(dst_destroy); -- cgit v1.1 From 152102c7f2bf191690f1069bae292ea3925adf14 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 30 Mar 2010 20:16:22 +0000 Subject: rps: keep the old behavior on SMP without rps keep the old behavior on SMP without rps RPS introduces a lock operation to per cpu variable input_pkt_queue on SMP whenever rps is enabled or not. On SMP without RPS, this lock isn't needed at all. Signed-off-by: Changli Gao ---- net/core/dev.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) Signed-off-by: David S. Miller --- net/core/dev.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 887aa84..427cd53 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -206,6 +206,20 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } +static inline void rps_lock(struct softnet_data *queue) +{ +#ifdef CONFIG_RPS + spin_lock(&queue->input_pkt_queue.lock); +#endif +} + +static inline void rps_unlock(struct softnet_data *queue) +{ +#ifdef CONFIG_RPS + spin_unlock(&queue->input_pkt_queue.lock); +#endif +} + /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -2313,13 +2327,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu) local_irq_save(flags); __get_cpu_var(netdev_rx_stat).total++; - spin_lock(&queue->input_pkt_queue.lock); + rps_lock(queue); if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { enqueue: __skb_queue_tail(&queue->input_pkt_queue, skb); - spin_unlock_irqrestore(&queue->input_pkt_queue.lock, - flags); + rps_unlock(queue); + local_irq_restore(flags); return NET_RX_SUCCESS; } @@ -2341,7 +2355,7 @@ enqueue: goto enqueue; } - spin_unlock(&queue->input_pkt_queue.lock); + rps_unlock(queue); __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); @@ -2766,19 +2780,19 @@ int netif_receive_skb(struct sk_buff *skb) EXPORT_SYMBOL(netif_receive_skb); /* Network device is going away, flush any packets still pending */ -static void flush_backlog(struct net_device *dev, int cpu) +static void flush_backlog(void *arg) { - struct softnet_data *queue = &per_cpu(softnet_data, cpu); + struct net_device *dev = arg; + struct softnet_data *queue = &__get_cpu_var(softnet_data); struct sk_buff *skb, *tmp; - unsigned long flags; - spin_lock_irqsave(&queue->input_pkt_queue.lock, flags); + rps_lock(queue); skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) if (skb->dev == dev) { __skb_unlink(skb, &queue->input_pkt_queue); kfree_skb(skb); } - spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags); + rps_unlock(queue); } static int napi_gro_complete(struct sk_buff *skb) @@ -3091,14 +3105,16 @@ static int process_backlog(struct napi_struct *napi, int quota) do { struct sk_buff *skb; - spin_lock_irq(&queue->input_pkt_queue.lock); + local_irq_disable(); + rps_lock(queue); skb = __skb_dequeue(&queue->input_pkt_queue); if (!skb) { __napi_complete(napi); spin_unlock_irq(&queue->input_pkt_queue.lock); break; } - spin_unlock_irq(&queue->input_pkt_queue.lock); + rps_unlock(queue); + local_irq_enable(); __netif_receive_skb(skb); } while (++work < quota && jiffies == start_time); @@ -5548,7 +5564,6 @@ void netdev_run_todo(void) while (!list_empty(&list)) { struct net_device *dev = list_first_entry(&list, struct net_device, todo_list); - int i; list_del(&dev->todo_list); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { @@ -5560,8 +5575,7 @@ void netdev_run_todo(void) dev->reg_state = NETREG_UNREGISTERED; - for_each_online_cpu(i) - flush_backlog(dev, i); + on_each_cpu(flush_backlog, dev, 1); netdev_wait_allrefs(dev); -- cgit v1.1 From d7997fe1f4584da12e9c29fb682c18e9bdc13b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Wed, 31 Mar 2010 00:17:06 +0000 Subject: flow: structurize flow cache Group all per-cpu data to one structure instead of having many globals. Also prepare the internals so that we can have multiple instances of the flow cache if needed. Only the kmem_cache is left as a global as all flow caches share the same element size, and benefit from using a common cache. Signed-off-by: Timo Teras Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/flow.c | 223 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 119 insertions(+), 104 deletions(-) (limited to 'net/core') diff --git a/net/core/flow.c b/net/core/flow.c index 9601587..1d27ca6 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -35,104 +35,105 @@ struct flow_cache_entry { atomic_t *object_ref; }; -atomic_t flow_cache_genid = ATOMIC_INIT(0); - -static u32 flow_hash_shift; -#define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; - -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) - -static struct kmem_cache *flow_cachep __read_mostly; - -static int flow_lwm, flow_hwm; - -struct flow_percpu_info { - int hash_rnd_recalc; - u32 hash_rnd; - int count; +struct flow_cache_percpu { + struct flow_cache_entry ** hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; }; -static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; - -#define flow_hash_rnd_recalc(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) -#define flow_hash_rnd(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd) -#define flow_count(cpu) \ - (per_cpu(flow_hash_info, cpu).count) - -static struct timer_list flow_hash_rnd_timer; - -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) struct flow_flush_info { - atomic_t cpuleft; - struct completion completion; + struct flow_cache * cache; + atomic_t cpuleft; + struct completion completion; }; -static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; -#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) +struct flow_cache { + u32 hash_shift; + unsigned long order; + struct flow_cache_percpu * percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); +static struct flow_cache flow_cache_global; +static struct kmem_cache *flow_cachep; + +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) { + struct flow_cache *fc = (void *) arg; int i; for_each_possible_cpu(i) - flow_hash_rnd_recalc(i) = 1; + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); } -static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flow_cache_entry *fle) { if (fle->object) atomic_dec(fle->object_ref); kmem_cache_free(flow_cachep, fle); - flow_count(cpu)--; + fcp->hash_count--; } -static void __flow_cache_shrink(int cpu, int shrink_to) +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) { struct flow_cache_entry *fle, **flp; int i; - for (i = 0; i < flow_hash_size; i++) { + for (i = 0; i < flow_cache_hash_size(fc); i++) { int k = 0; - flp = &flow_table(cpu)[i]; + flp = &fcp->hash_table[i]; while ((fle = *flp) != NULL && k < shrink_to) { k++; flp = &fle->next; } while ((fle = *flp) != NULL) { *flp = fle->next; - flow_entry_kill(cpu, fle); + flow_entry_kill(fc, fcp, fle); } } } -static void flow_cache_shrink(int cpu) +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - int shrink_to = flow_lwm / flow_hash_size; + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - __flow_cache_shrink(cpu, shrink_to); + __flow_cache_shrink(fc, fcp, shrink_to); } -static void flow_new_hash_rnd(int cpu) +static void flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); - flow_hash_rnd_recalc(cpu) = 0; - - __flow_cache_shrink(cpu, 0); + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); } -static u32 flow_hash_code(struct flowi *key, int cpu) +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flowi *key) { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & - (flow_hash_size - 1)); + return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1)); } #if (BITS_PER_LONG == 64) @@ -168,24 +169,25 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, **head; unsigned int hash; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!fcp->hash_table) goto nocache; - if (flow_hash_rnd_recalc(cpu)) - flow_new_hash_rnd(cpu); - hash = flow_hash_code(key, cpu); + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); + hash = flow_hash_code(fc, fcp, key); - head = &flow_table(cpu)[hash]; + head = &fcp->hash_table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -204,8 +206,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, } if (!fle) { - if (flow_count(cpu) > flow_hwm) - flow_cache_shrink(cpu); + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { @@ -215,7 +217,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, fle->dir = dir; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; - flow_count(cpu)++; + fcp->hash_count++; } } @@ -249,14 +251,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache *fc = info->cache; + struct flow_cache_percpu *fcp; int i; - int cpu; - cpu = smp_processor_id(); - for (i = 0; i < flow_hash_size; i++) { + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + for (i = 0; i < flow_cache_hash_size(fc); i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = fcp->hash_table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -272,7 +275,6 @@ static void flow_cache_flush_tasklet(unsigned long data) complete(&info->completion); } -static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; @@ -280,8 +282,7 @@ static void flow_cache_flush_per_cpu(void *data) struct tasklet_struct *tasklet; cpu = smp_processor_id(); - - tasklet = flow_flush_tasklet(cpu); + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; tasklet->data = (unsigned long)info; tasklet_schedule(tasklet); } @@ -294,6 +295,7 @@ void flow_cache_flush(void) /* Don't want cpus going down or up during this. */ get_online_cpus(); mutex_lock(&flow_flush_sem); + info.cache = &flow_cache_global; atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -307,62 +309,75 @@ void flow_cache_flush(void) put_online_cpus(); } -static void __init flow_cache_cpu_prepare(int cpu) +static void __init flow_cache_cpu_prepare(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - struct tasklet_struct *tasklet; - unsigned long order; - - for (order = 0; - (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_hash_size); - order++) - /* NOTHING */; - - flow_table(cpu) = (struct flow_cache_entry **) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!flow_table(cpu)) - panic("NET: failed to allocate flow cache order %lu\n", order); - - flow_hash_rnd_recalc(cpu) = 1; - flow_count(cpu) = 0; - - tasklet = flow_flush_tasklet(cpu); - tasklet_init(tasklet, flow_cache_flush_tasklet, 0); + fcp->hash_table = (struct flow_cache_entry **) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); + if (!fcp->hash_table) + panic("NET: failed to allocate flow cache order %lu\n", fc->order); + + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); } static int flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) { + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - __flow_cache_shrink((unsigned long)hcpu, 0); + __flow_cache_shrink(fc, fcp, 0); return NOTIFY_OK; } -static int __init flow_cache_init(void) +static int flow_cache_init(struct flow_cache *fc) { + unsigned long order; int i; - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, - NULL); - flow_hash_shift = 10; - flow_lwm = 2 * flow_hash_size; - flow_hwm = 4 * flow_hash_size; + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); + + for (order = 0; + (PAGE_SIZE << order) < + (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc)); + order++) + /* NOTHING */; + fc->order = order; + fc->percpu = alloc_percpu(struct flow_cache_percpu); - setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, + (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); for_each_possible_cpu(i) - flow_cache_cpu_prepare(i); + flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i)); + + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); - hotcpu_notifier(flow_cache_cpu, 0); return 0; } -module_init(flow_cache_init); +static int __init flow_cache_init_global(void) +{ + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); + + return flow_cache_init(&flow_cache_global); +} + +module_init(flow_cache_init_global); EXPORT_SYMBOL(flow_cache_genid); EXPORT_SYMBOL(flow_cache_lookup); -- cgit v1.1 From 5acbbd428db47b12f137a8a2aa96b3c0a96b744e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 30 Mar 2010 22:35:50 +0000 Subject: net: change illegal_highdma to use dma_mask Robert Hancock pointed out two problems about NETIF_F_HIGHDMA: -Many drivers only set the flag when they detect they can use 64-bit DMA, since otherwise they could receive DMA addresses that they can't handle (which on platforms without IOMMU/SWIOTLB support is fatal). This means that if 64-bit support isn't available, even buffers located below 4GB will get copied unnecessarily. -Some drivers set the flag even though they can't actually handle 64-bit DMA, which would mean that on platforms without IOMMU/SWIOTLB they would get a DMA mapping error if the memory they received happened to be located above 4GB. http://lkml.org/lkml/2010/3/3/530 We can use the dma_mask if we need bouncing or not here. Then we can safely fix drivers that misuse NETIF_F_HIGHDMA. Signed-off-by: FUJITA Tomonori Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 427cd53..e19cdae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -1804,14 +1805,21 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + return 1; + } - if (dev->features & NETIF_F_HIGHDMA) - return 0; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) - return 1; + if (PCI_DMA_BUS_IS_PHYS) { + struct device *pdev = dev->dev.parent; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + return 1; + } + } #endif return 0; } -- cgit v1.1 From 9092c658bab215b2752fa59d2a36c05b74d1e9e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Apr 2010 13:34:49 -0700 Subject: net: illegal_highdma() fix Followup to commit 5acbbd428db47b12f137a8a2aa96b3c0a96b744e (net: change illegal_highdma to use dma_mask) If dev->dev.parent is NULL, we should not try to dereference it. Dont force inline illegal_highdma() as its pretty big now. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e19cdae..c6b5206 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1801,7 +1801,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -1814,6 +1814,8 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (PCI_DMA_BUS_IS_PHYS) { struct device *pdev = dev->dev.parent; + if (!pdev) + return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) -- cgit v1.1 From a748ee2426817a95b1f03012d8f339c45c722ae1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 1 Apr 2010 21:22:09 +0000 Subject: net: move address list functions to a separate file +little renaming of unicast functions to be smooth with multicast ones Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/Makefile | 3 +- net/core/dev.c | 430 +---------------------------------------- net/core/dev_addr_lists.c | 478 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 483 insertions(+), 428 deletions(-) create mode 100644 net/core/dev_addr_lists.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 08791ac..0a899f1 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -8,7 +8,8 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ - neighbour.o rtnetlink.o utils.o link_watch.o filter.o + neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ + dev_addr_lists.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/dev.c b/net/core/dev.c index c6b5206..949c62d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3968,314 +3968,6 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } -/* hw addresses list handling functions */ - -static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - int alloc_size; - - if (addr_len > MAX_ADDR_LEN) - return -EINVAL; - - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - ha->type == addr_type) { - ha->refcount++; - return 0; - } - } - - - alloc_size = sizeof(*ha); - if (alloc_size < L1_CACHE_BYTES) - alloc_size = L1_CACHE_BYTES; - ha = kmalloc(alloc_size, GFP_ATOMIC); - if (!ha) - return -ENOMEM; - memcpy(ha->addr, addr, addr_len); - ha->type = addr_type; - ha->refcount = 1; - ha->synced = false; - list_add_tail_rcu(&ha->list, &list->list); - list->count++; - return 0; -} - -static void ha_rcu_free(struct rcu_head *head) -{ - struct netdev_hw_addr *ha; - - ha = container_of(head, struct netdev_hw_addr, rcu_head); - kfree(ha); -} - -static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) { - if (--ha->refcount) - return 0; - list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); - list->count--; - return 0; - } - } - return -ENOENT; -} - -static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, - unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; - } - return 0; - -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; -} - -static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, - unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, addr_type); - } -} - -static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) -{ - int err = 0; - struct netdev_hw_addr *ha, *tmp; - - list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (!ha->synced) { - err = __hw_addr_add(to_list, ha->addr, - addr_len, ha->type); - if (err) - break; - ha->synced = true; - ha->refcount++; - } else if (ha->refcount == 1) { - __hw_addr_del(to_list, ha->addr, addr_len, ha->type); - __hw_addr_del(from_list, ha->addr, addr_len, ha->type); - } - } - return err; -} - -static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) -{ - struct netdev_hw_addr *ha, *tmp; - - list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (ha->synced) { - __hw_addr_del(to_list, ha->addr, - addr_len, ha->type); - ha->synced = false; - __hw_addr_del(from_list, ha->addr, - addr_len, ha->type); - } - } -} - -static void __hw_addr_flush(struct netdev_hw_addr_list *list) -{ - struct netdev_hw_addr *ha, *tmp; - - list_for_each_entry_safe(ha, tmp, &list->list, list) { - list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); - } - list->count = 0; -} - -static void __hw_addr_init(struct netdev_hw_addr_list *list) -{ - INIT_LIST_HEAD(&list->list); - list->count = 0; -} - -/* Device addresses handling functions */ - -static void dev_addr_flush(struct net_device *dev) -{ - /* rtnl_mutex must be held here */ - - __hw_addr_flush(&dev->dev_addrs); - dev->dev_addr = NULL; -} - -static int dev_addr_init(struct net_device *dev) -{ - unsigned char addr[MAX_ADDR_LEN]; - struct netdev_hw_addr *ha; - int err; - - /* rtnl_mutex must be held here */ - - __hw_addr_init(&dev->dev_addrs); - memset(addr, 0, sizeof(addr)); - err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), - NETDEV_HW_ADDR_T_LAN); - if (!err) { - /* - * Get the first (previously created) address from the list - * and set dev_addr pointer to this location. - */ - ha = list_first_entry(&dev->dev_addrs.list, - struct netdev_hw_addr, list); - dev->dev_addr = ha->addr; - } - return err; -} - -/** - * dev_addr_add - Add a device address - * @dev: device - * @addr: address to add - * @addr_type: address type - * - * Add a device address to the device or increase the reference count if - * it already exists. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_add(struct net_device *dev, unsigned char *addr, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add); - -/** - * dev_addr_del - Release a device address. - * @dev: device - * @addr: address to delete - * @addr_type: address type - * - * Release reference to a device address and remove it from the device - * if the reference count drops to zero. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del(struct net_device *dev, unsigned char *addr, - unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha; - - ASSERT_RTNL(); - - /* - * We can not remove the first address from the list because - * dev->dev_addr points to that. - */ - ha = list_first_entry(&dev->dev_addrs.list, - struct netdev_hw_addr, list); - if (ha->addr == dev->dev_addr && ha->refcount == 1) - return -ENOENT; - - err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, - addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; -} -EXPORT_SYMBOL(dev_addr_del); - -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex. - */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device by which addresses the addresses will be deleted - * @addr_type: address type - 0 means type will used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - /* multicast addresses handling functions */ int __dev_addr_delete(struct dev_addr_list **list, int *count, @@ -4336,57 +4028,6 @@ int __dev_addr_add(struct dev_addr_list **list, int *count, return 0; } -/** - * dev_unicast_delete - Release secondary unicast address. - * @dev: device - * @addr: address to delete - * - * Release reference to a secondary unicast address and remove it - * from the device if the reference count drops to zero. - * - * The caller must hold the rtnl_mutex. - */ -int dev_unicast_delete(struct net_device *dev, void *addr) -{ - int err; - - ASSERT_RTNL(); - - netif_addr_lock_bh(dev); - err = __hw_addr_del(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST); - if (!err) - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); - return err; -} -EXPORT_SYMBOL(dev_unicast_delete); - -/** - * dev_unicast_add - add a secondary unicast address - * @dev: device - * @addr: address to add - * - * Add a secondary unicast address to the device or increase - * the reference count if it already exists. - * - * The caller must hold the rtnl_mutex. - */ -int dev_unicast_add(struct net_device *dev, void *addr) -{ - int err; - - ASSERT_RTNL(); - - netif_addr_lock_bh(dev); - err = __hw_addr_add(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST); - if (!err) - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); - return err; -} -EXPORT_SYMBOL(dev_unicast_add); int __dev_addr_sync(struct dev_addr_list **to, int *to_count, struct dev_addr_list **from, int *from_count) @@ -4436,71 +4077,6 @@ void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, } EXPORT_SYMBOL_GPL(__dev_addr_unsync); -/** - * dev_unicast_sync - Synchronize device's unicast list to another device - * @to: destination device - * @from: source device - * - * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. - * - * This function is intended to be called from the dev->set_rx_mode - * function of layered software devices. - */ -int dev_unicast_sync(struct net_device *to, struct net_device *from) -{ - int err = 0; - - if (to->addr_len != from->addr_len) - return -EINVAL; - - netif_addr_lock_bh(to); - err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); - if (!err) - __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); - return err; -} -EXPORT_SYMBOL(dev_unicast_sync); - -/** - * dev_unicast_unsync - Remove synchronized addresses from the destination device - * @to: destination device - * @from: source device - * - * Remove all addresses that were added to the destination device by - * dev_unicast_sync(). This function is intended to be called from the - * dev->stop function of layered software devices. - */ -void dev_unicast_unsync(struct net_device *to, struct net_device *from) -{ - if (to->addr_len != from->addr_len) - return; - - netif_addr_lock_bh(from); - netif_addr_lock(to); - __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); - __dev_set_rx_mode(to); - netif_addr_unlock(to); - netif_addr_unlock_bh(from); -} -EXPORT_SYMBOL(dev_unicast_unsync); - -void dev_unicast_flush(struct net_device *dev) -{ - netif_addr_lock_bh(dev); - __hw_addr_flush(&dev->uc); - netif_addr_unlock_bh(dev); -} -EXPORT_SYMBOL(dev_unicast_flush); - -static void dev_unicast_init(struct net_device *dev) -{ - __hw_addr_init(&dev->uc); -} - - static void __dev_addr_discard(struct dev_addr_list **list) { struct dev_addr_list *tmp; @@ -5153,7 +4729,7 @@ static void rollback_registered_many(struct list_head *head) /* * Flush the unicast and multicast chains */ - dev_unicast_flush(dev); + dev_uc_flush(dev); dev_addr_discard(dev); if (dev->netdev_ops->ndo_uninit) @@ -5734,7 +5310,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (dev_addr_init(dev)) goto free_rx; - dev_unicast_init(dev); + dev_uc_init(dev); dev_net_set(dev, &init_net); @@ -5968,7 +5544,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* * Flush the unicast and multicast chains */ - dev_unicast_flush(dev); + dev_uc_flush(dev); dev_addr_discard(dev); netdev_unregister_kobject(dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c new file mode 100644 index 0000000..7e52b6d --- /dev/null +++ b/net/core/dev_addr_lists.c @@ -0,0 +1,478 @@ +/* + * net/core/dev_addr_lists.c - Functions for handling net device lists + * Copyright (c) 2010 Jiri Pirko + * + * This file contains functions for working with unicast, multicast and device + * addresses lists. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include + +/* + * General list handling functions + */ + +static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + ha->refcount++; + return 0; + } + } + + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + ha->refcount = 1; + ha->synced = false; + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + return 0; +} + +static void ha_rcu_free(struct rcu_head *head) +{ + struct netdev_hw_addr *ha; + + ha = container_of(head, struct netdev_hw_addr, rcu_head); + kfree(ha); +} + +static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + if (--ha->refcount) + return 0; + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + list->count--; + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, + unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, &from_list->list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del(to_list, ha2->addr, addr_len, type); + } + return err; +} + +static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, + unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del(to_list, ha->addr, addr_len, addr_type); + } +} + +static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, ha->addr, addr_len, ha->type); + __hw_addr_del(from_list, ha->addr, addr_len, ha->type); + } + } + return err; +} + +static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->synced) { + __hw_addr_del(to_list, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, ha->addr, + addr_len, ha->type); + } + } +} + +static void __hw_addr_flush(struct netdev_hw_addr_list *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + } + list->count = 0; +} + +static void __hw_addr_init(struct netdev_hw_addr_list *list) +{ + INIT_LIST_HEAD(&list->list); + list->count = 0; +} + +/* + * Device addresses handling functions + */ + +/** + * dev_addr_flush - Flush device address list + * @dev: device + * + * Flush device address list and reset ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addrs); + dev->dev_addr = NULL; +} +EXPORT_SYMBOL(dev_addr_flush); + +/** + * dev_addr_init - Init device address list + * @dev: device + * + * Init device address list and create the first element, + * used by ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + __hw_addr_init(&dev->dev_addrs); + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} +EXPORT_SYMBOL(dev_addr_init); + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha; + + ASSERT_RTNL(); + + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device by which addresses the addresses will be deleted + * @addr_type: address type - 0 means type will used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* + * Unicast list handling functions + */ + +/** + * dev_uc_add - Add a secondary unicast address + * @dev: device + * @addr: address to add + * + * Add a secondary unicast address to the device or increase + * the reference count if it already exists. + */ +int dev_uc_add(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_add); + +/** + * dev_uc_del - Release secondary unicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a secondary unicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_uc_del(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_del); + +/** + * dev_uc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_tx_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. + */ +int dev_uc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_bh(to); + err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync); + +/** + * dev_uc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_uc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_uc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_uc_unsync); + +/** + * dev_uc_flush - Flush unicast addresses + * @dev: device + * + * Flush unicast addresses. + */ +void dev_uc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->uc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_uc_flush); + +/** + * dev_uc_flush - Init unicast address list + * @dev: device + * + * Init unicast address list. + */ +void dev_uc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->uc); +} +EXPORT_SYMBOL(dev_uc_init); + +/* + * Multicast list handling functions + */ + +/* To be filled here */ -- cgit v1.1 From 22bedad3ce112d5ca1eaf043d4990fa2ed698c87 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 1 Apr 2010 21:22:57 +0000 Subject: net: convert multicast list to list_head Converts the list and the core manipulating with it to be the same as uc_list. +uses two functions for adding/removing mc address (normal and "global" variant) instead of a function parameter. +removes dev_mcast.c completely. +exposes netdev_hw_addr_list_* macros along with __hw_addr_* functions for manipulation with lists on a sandbox (used in bonding and 80211 drivers) Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/Makefile | 5 +- net/core/dev.c | 145 +--------------------- net/core/dev_addr_lists.c | 305 ++++++++++++++++++++++++++++++++++++++++++---- net/core/dev_mcast.c | 232 ----------------------------------- 4 files changed, 291 insertions(+), 396 deletions(-) delete mode 100644 net/core/dev_mcast.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 0a899f1..51c3eec 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -7,9 +7,8 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ - neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - dev_addr_lists.o +obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ + neighbour.o rtnetlink.o utils.o link_watch.o filter.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/dev.c b/net/core/dev.c index 949c62d..2a9b7dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3968,140 +3968,6 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } -/* multicast addresses handling functions */ - -int __dev_addr_delete(struct dev_addr_list **list, int *count, - void *addr, int alen, int glbl) -{ - struct dev_addr_list *da; - - for (; (da = *list) != NULL; list = &da->next) { - if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && - alen == da->da_addrlen) { - if (glbl) { - int old_glbl = da->da_gusers; - da->da_gusers = 0; - if (old_glbl == 0) - break; - } - if (--da->da_users) - return 0; - - *list = da->next; - kfree(da); - (*count)--; - return 0; - } - } - return -ENOENT; -} - -int __dev_addr_add(struct dev_addr_list **list, int *count, - void *addr, int alen, int glbl) -{ - struct dev_addr_list *da; - - for (da = *list; da != NULL; da = da->next) { - if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && - da->da_addrlen == alen) { - if (glbl) { - int old_glbl = da->da_gusers; - da->da_gusers = 1; - if (old_glbl) - return 0; - } - da->da_users++; - return 0; - } - } - - da = kzalloc(sizeof(*da), GFP_ATOMIC); - if (da == NULL) - return -ENOMEM; - memcpy(da->da_addr, addr, alen); - da->da_addrlen = alen; - da->da_users = 1; - da->da_gusers = glbl ? 1 : 0; - da->next = *list; - *list = da; - (*count)++; - return 0; -} - - -int __dev_addr_sync(struct dev_addr_list **to, int *to_count, - struct dev_addr_list **from, int *from_count) -{ - struct dev_addr_list *da, *next; - int err = 0; - - da = *from; - while (da != NULL) { - next = da->next; - if (!da->da_synced) { - err = __dev_addr_add(to, to_count, - da->da_addr, da->da_addrlen, 0); - if (err < 0) - break; - da->da_synced = 1; - da->da_users++; - } else if (da->da_users == 1) { - __dev_addr_delete(to, to_count, - da->da_addr, da->da_addrlen, 0); - __dev_addr_delete(from, from_count, - da->da_addr, da->da_addrlen, 0); - } - da = next; - } - return err; -} -EXPORT_SYMBOL_GPL(__dev_addr_sync); - -void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, - struct dev_addr_list **from, int *from_count) -{ - struct dev_addr_list *da, *next; - - da = *from; - while (da != NULL) { - next = da->next; - if (da->da_synced) { - __dev_addr_delete(to, to_count, - da->da_addr, da->da_addrlen, 0); - da->da_synced = 0; - __dev_addr_delete(from, from_count, - da->da_addr, da->da_addrlen, 0); - } - da = next; - } -} -EXPORT_SYMBOL_GPL(__dev_addr_unsync); - -static void __dev_addr_discard(struct dev_addr_list **list) -{ - struct dev_addr_list *tmp; - - while (*list != NULL) { - tmp = *list; - *list = tmp->next; - if (tmp->da_users > tmp->da_gusers) - printk("__dev_addr_discard: address leakage! " - "da_users=%d\n", tmp->da_users); - kfree(tmp); - } -} - -void dev_addr_discard(struct net_device *dev) -{ - netif_addr_lock_bh(dev); - - __dev_addr_discard(&dev->mc_list); - netdev_mc_count(dev) = 0; - - netif_addr_unlock_bh(dev); -} -EXPORT_SYMBOL(dev_addr_discard); - /** * dev_get_flags - get flags reported to userspace * @dev: device @@ -4412,8 +4278,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); case SIOCDELMULTI: if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || @@ -4421,8 +4286,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) @@ -4730,7 +4594,7 @@ static void rollback_registered_many(struct list_head *head) * Flush the unicast and multicast chains */ dev_uc_flush(dev); - dev_addr_discard(dev); + dev_mc_flush(dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -5310,6 +5174,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (dev_addr_init(dev)) goto free_rx; + dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); @@ -5545,7 +5410,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * Flush the unicast and multicast chains */ dev_uc_flush(dev); - dev_addr_discard(dev); + dev_mc_flush(dev); netdev_unregister_kobject(dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 7e52b6d..37d5975 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -19,8 +19,9 @@ * General list handling functions */ -static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) { struct netdev_hw_addr *ha; int alloc_size; @@ -31,6 +32,13 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, list_for_each_entry(ha, &list->list, list) { if (!memcmp(ha->addr, addr, addr_len) && ha->type == addr_type) { + if (global) { + /* check if addr is already used as global */ + if (ha->global_use) + return 0; + else + ha->global_use = true; + } ha->refcount++; return 0; } @@ -46,12 +54,19 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; + ha->global_use = global; ha->synced = false; list_add_tail_rcu(&ha->list, &list->list); list->count++; return 0; } +static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); +} + static void ha_rcu_free(struct rcu_head *head) { struct netdev_hw_addr *ha; @@ -60,14 +75,21 @@ static void ha_rcu_free(struct rcu_head *head) kfree(ha); } -static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) { struct netdev_hw_addr *ha; list_for_each_entry(ha, &list->list, list) { if (!memcmp(ha->addr, addr, addr_len) && (ha->type == addr_type || !addr_type)) { + if (global) { + if (!ha->global_use) + break; + else + ha->global_use = false; + } if (--ha->refcount) return 0; list_del_rcu(&ha->list); @@ -79,10 +101,15 @@ static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, return -ENOENT; } -static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, - unsigned char addr_type) +static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); +} + +int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) { int err; struct netdev_hw_addr *ha, *ha2; @@ -105,11 +132,11 @@ unroll: } return err; } +EXPORT_SYMBOL(__hw_addr_add_multiple); -static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, - unsigned char addr_type) +void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) { struct netdev_hw_addr *ha; unsigned char type; @@ -119,10 +146,11 @@ static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, __hw_addr_del(to_list, ha->addr, addr_len, addr_type); } } +EXPORT_SYMBOL(__hw_addr_del_multiple); -static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) +int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; @@ -142,10 +170,11 @@ static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, } return err; } +EXPORT_SYMBOL(__hw_addr_sync); -static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) +void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { struct netdev_hw_addr *ha, *tmp; @@ -159,8 +188,9 @@ static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, } } } +EXPORT_SYMBOL(__hw_addr_unsync); -static void __hw_addr_flush(struct netdev_hw_addr_list *list) +void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; @@ -170,12 +200,14 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list) } list->count = 0; } +EXPORT_SYMBOL(__hw_addr_flush); -static void __hw_addr_init(struct netdev_hw_addr_list *list) +void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; } +EXPORT_SYMBOL(__hw_addr_init); /* * Device addresses handling functions @@ -475,4 +507,235 @@ EXPORT_SYMBOL(dev_uc_init); * Multicast list handling functions */ -/* To be filled here */ +static int __dev_mc_add(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +/** + * dev_mc_add - Add a multicast address + * @dev: device + * @addr: address to add + * + * Add a multicast address to the device or increase + * the reference count if it already exists. + */ +int dev_mc_add(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_add); + +/** + * dev_mc_add_global - Add a global multicast address + * @dev: device + * @addr: address to add + * + * Add a global multicast address to the device. + */ +int dev_mc_add_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_add_global); + +static int __dev_mc_del(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} + +/** + * dev_mc_del - Delete a multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_del); + +/** + * dev_mc_del_global - Delete a global multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_del_global); + +/** + * dev_mc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_tx_lock_bh. + * + * This function is intended to be called from the dev->set_multicast_list + * or dev->set_rx_mode function of layered software devices. + */ +int dev_mc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_bh(to); + err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync); + +/** + * dev_mc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_mc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_mc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_mc_unsync); + +/** + * dev_mc_flush - Flush multicast addresses + * @dev: device + * + * Flush multicast addresses. + */ +void dev_mc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->mc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_mc_flush); + +/** + * dev_mc_flush - Init multicast address list + * @dev: device + * + * Init multicast address list. + */ +void dev_mc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->mc); +} +EXPORT_SYMBOL(dev_mc_init); + +#ifdef CONFIG_PROC_FS +#include +#include + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, ha->refcount, ha->global_use); + + for (i = 0; i < dev->addr_len; i++) + seq_printf(seq, "%02x", ha->addr[i]); + + seq_putc(seq, '\n'); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + proc_net_remove(net, "dev_mcast"); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +void __init dev_mcast_init(void) +{ + register_pernet_subsys(&dev_mc_net_ops); +} + diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c deleted file mode 100644 index 3dc295b..0000000 --- a/net/core/dev_mcast.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Linux NET3: Multicast List maintenance. - * - * Authors: - * Tim Kordas - * Richard Underwood - * - * Stir fried together from the IP multicast and CAP patches above - * Alan Cox - * - * Fixes: - * Alan Cox : Update the device on a real delete - * rather than any time but... - * Alan Cox : IFF_ALLMULTI support. - * Alan Cox : New format set_multicast_list() calls. - * Gleb Natapov : Remove dev_mc_lock. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * Device multicast list maintenance. - * - * This is used both by IP and by the user level maintenance functions. - * Unlike BSD we maintain a usage count on a given multicast address so - * that a casual user application can add/delete multicasts used by - * protocols without doing damage to the protocols when it deletes the - * entries. It also helps IP as it tracks overlapping maps. - * - * Device mc lists are changed by bh at least if IPv6 is enabled, - * so that it must be bh protected. - * - * We block accesses to device mc filters with netif_tx_lock. - */ - -/* - * Delete a device level multicast - */ - -int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) -{ - int err; - - netif_addr_lock_bh(dev); - err = __dev_addr_delete(&dev->mc_list, &dev->mc_count, - addr, alen, glbl); - if (!err) { - /* - * We have altered the list, so the card - * loaded filter is now wrong. Fix it - */ - - __dev_set_rx_mode(dev); - } - netif_addr_unlock_bh(dev); - return err; -} - -/* - * Add a device level multicast - */ - -int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) -{ - int err; - - netif_addr_lock_bh(dev); - if (alen != dev->addr_len) - err = -EINVAL; - else - err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); - if (!err) - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); - return err; -} - -/** - * dev_mc_sync - Synchronize device's multicast list to another device - * @to: destination device - * @from: source device - * - * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. - * - * This function is intended to be called from the dev->set_multicast_list - * or dev->set_rx_mode function of layered software devices. - */ -int dev_mc_sync(struct net_device *to, struct net_device *from) -{ - int err = 0; - - netif_addr_lock_bh(to); - err = __dev_addr_sync(&to->mc_list, &to->mc_count, - &from->mc_list, &from->mc_count); - if (!err) - __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); - - return err; -} -EXPORT_SYMBOL(dev_mc_sync); - - -/** - * dev_mc_unsync - Remove synchronized addresses from the destination - * device - * @to: destination device - * @from: source device - * - * Remove all addresses that were added to the destination device by - * dev_mc_sync(). This function is intended to be called from the - * dev->stop function of layered software devices. - */ -void dev_mc_unsync(struct net_device *to, struct net_device *from) -{ - netif_addr_lock_bh(from); - netif_addr_lock(to); - - __dev_addr_unsync(&to->mc_list, &to->mc_count, - &from->mc_list, &from->mc_count); - __dev_set_rx_mode(to); - - netif_addr_unlock(to); - netif_addr_unlock_bh(from); -} -EXPORT_SYMBOL(dev_mc_unsync); - -#ifdef CONFIG_PROC_FS -static int dev_mc_seq_show(struct seq_file *seq, void *v) -{ - struct dev_addr_list *m; - struct net_device *dev = v; - - if (v == SEQ_START_TOKEN) - return 0; - - netif_addr_lock_bh(dev); - for (m = dev->mc_list; m; m = m->next) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, m->dmi_users, m->dmi_gusers); - - for (i = 0; i < m->dmi_addrlen; i++) - seq_printf(seq, "%02x", m->dmi_addr[i]); - - seq_putc(seq, '\n'); - } - netif_addr_unlock_bh(dev); - return 0; -} - -static const struct seq_operations dev_mc_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_mc_seq_show, -}; - -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, - .open = dev_mc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif - -static int __net_init dev_mc_net_init(struct net *net) -{ - if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) - return -ENOMEM; - return 0; -} - -static void __net_exit dev_mc_net_exit(struct net *net) -{ - proc_net_remove(net, "dev_mcast"); -} - -static struct pernet_operations __net_initdata dev_mc_net_ops = { - .init = dev_mc_net_init, - .exit = dev_mc_net_exit, -}; - -void __init dev_mcast_init(void) -{ - register_pernet_subsys(&dev_mc_net_ops); -} - -EXPORT_SYMBOL(dev_mc_add); -EXPORT_SYMBOL(dev_mc_delete); -- cgit v1.1 From 5a6d234e73d7d021c74e1aa349b3b37b81372c66 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 5 Apr 2010 14:37:19 -0700 Subject: rps: fixed missed rps_unlock Fix spin_unlock_irq which needs to be rps_unlock. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2a9b7dd..74f77ca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3120,7 +3120,7 @@ static int process_backlog(struct napi_struct *napi, int quota) skb = __skb_dequeue(&queue->input_pkt_queue); if (!skb) { __napi_complete(napi); - spin_unlock_irq(&queue->input_pkt_queue.lock); + rps_unlock(queue); break; } rps_unlock(queue); -- cgit v1.1 From e4008276fddd10445ff06707694a938cb7f35ed4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Apr 2010 15:42:39 -0700 Subject: net: Add a missing local_irq_enable() As noticed by Changli Gao, we must call local_irq_enable() after rps_unlock() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 74f77ca..b98ddc6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3121,6 +3121,7 @@ static int process_backlog(struct napi_struct *napi, int quota) if (!skb) { __napi_complete(napi); rps_unlock(queue); + local_irq_enable(); break; } rps_unlock(queue); -- cgit v1.1 From fe1a5f031e76bd8761a7803d75b95ee96e84a574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Wed, 7 Apr 2010 00:30:04 +0000 Subject: flow: virtualize flow cache entry methods This allows to validate the cached object before returning it. It also allows to destruct object properly, if the last reference was held in flow cache. This is also a prepartion for caching bundles in the flow cache. In return for virtualizing the methods, we save on: - not having to regenerate the whole flow cache on policy removal: each flow matching a killed policy gets refreshed as the getter function notices it smartly. - we do not have to call flow_cache_flush from policy gc, since the flow cache now properly deletes the object if it had any references Signed-off-by: Timo Teras Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/flow.c | 128 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 60 deletions(-) (limited to 'net/core') diff --git a/net/core/flow.c b/net/core/flow.c index 1d27ca6..521df52 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -26,17 +26,16 @@ #include struct flow_cache_entry { - struct flow_cache_entry *next; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - void *object; - atomic_t *object_ref; + struct flow_cache_entry *next; + u16 family; + u8 dir; + u32 genid; + struct flowi key; + struct flow_cache_object *object; }; struct flow_cache_percpu { - struct flow_cache_entry ** hash_table; + struct flow_cache_entry **hash_table; int hash_count; u32 hash_rnd; int hash_rnd_recalc; @@ -44,7 +43,7 @@ struct flow_cache_percpu { }; struct flow_flush_info { - struct flow_cache * cache; + struct flow_cache *cache; atomic_t cpuleft; struct completion completion; }; @@ -52,7 +51,7 @@ struct flow_flush_info { struct flow_cache { u32 hash_shift; unsigned long order; - struct flow_cache_percpu * percpu; + struct flow_cache_percpu *percpu; struct notifier_block hotcpu_notifier; int low_watermark; int high_watermark; @@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg) add_timer(&fc->rnd_timer); } +static int flow_entry_valid(struct flow_cache_entry *fle) +{ + if (atomic_read(&flow_cache_genid) != fle->genid) + return 0; + if (fle->object && !fle->object->ops->check(fle->object)) + return 0; + return 1; +} + static void flow_entry_kill(struct flow_cache *fc, struct flow_cache_percpu *fcp, struct flow_cache_entry *fle) { if (fle->object) - atomic_dec(fle->object_ref); + fle->object->ops->delete(fle->object); kmem_cache_free(flow_cachep, fle); fcp->hash_count--; } @@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc, int i; for (i = 0; i < flow_cache_hash_size(fc); i++) { - int k = 0; + int saved = 0; flp = &fcp->hash_table[i]; - while ((fle = *flp) != NULL && k < shrink_to) { - k++; - flp = &fle->next; - } while ((fle = *flp) != NULL) { - *flp = fle->next; - flow_entry_kill(fc, fcp, fle); + if (saved < shrink_to && + flow_entry_valid(fle)) { + saved++; + flp = &fle->next; + } else { + *flp = fle->next; + flow_entry_kill(fc, fcp, fle); + } } } } @@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver) +struct flow_cache_object * +flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver, void *ctx) { struct flow_cache *fc = &flow_cache_global; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, **head; + struct flow_cache_object *flo; unsigned int hash; local_bh_disable(); fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); fle = NULL; + flo = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ if (!fcp->hash_table) @@ -185,27 +198,17 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, if (fcp->hash_rnd_recalc) flow_new_hash_rnd(fc, fcp); - hash = flow_hash_code(fc, fcp, key); + hash = flow_hash_code(fc, fcp, key); head = &fcp->hash_table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && - flow_key_compare(key, &fle->key) == 0) { - if (fle->genid == atomic_read(&flow_cache_genid)) { - void *ret = fle->object; - - if (ret) - atomic_inc(fle->object_ref); - local_bh_enable(); - - return ret; - } + flow_key_compare(key, &fle->key) == 0) break; - } } - if (!fle) { + if (unlikely(!fle)) { if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); @@ -219,33 +222,39 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, fle->object = NULL; fcp->hash_count++; } + } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + flo = fle->object; + if (!flo) + goto ret_object; + flo = flo->ops->get(flo); + if (flo) + goto ret_object; + } else if (fle->object) { + flo = fle->object; + flo->ops->delete(flo); + fle->object = NULL; } nocache: - { - int err; - void *obj; - atomic_t *obj_ref; - - err = resolver(net, key, family, dir, &obj, &obj_ref); - - if (fle && !err) { - fle->genid = atomic_read(&flow_cache_genid); - - if (fle->object) - atomic_dec(fle->object_ref); - - fle->object = obj; - fle->object_ref = obj_ref; - if (obj) - atomic_inc(fle->object_ref); - } - local_bh_enable(); - - if (err) - obj = ERR_PTR(err); - return obj; + flo = NULL; + if (fle) { + flo = fle->object; + fle->object = NULL; + } + flo = resolver(net, key, family, dir, flo, ctx); + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + if (!IS_ERR(flo)) + fle->object = flo; + else + fle->genid--; + } else { + if (flo && !IS_ERR(flo)) + flo->ops->delete(flo); } +ret_object: + local_bh_enable(); + return flo; } static void flow_cache_flush_tasklet(unsigned long data) @@ -261,13 +270,12 @@ static void flow_cache_flush_tasklet(unsigned long data) fle = fcp->hash_table[i]; for (; fle; fle = fle->next) { - unsigned genid = atomic_read(&flow_cache_genid); - - if (!fle->object || fle->genid == genid) + if (flow_entry_valid(fle)) continue; + if (fle->object) + fle->object->ops->delete(fle->object); fle->object = NULL; - atomic_dec(fle->object_ref); } } -- cgit v1.1 From 8e4795605d1e1b39113818ad7c147b8a867a1f6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Wed, 7 Apr 2010 00:30:07 +0000 Subject: flow: delayed deletion of flow cache entries Speed up lookups by freeing flow cache entries later. After virtualizing flow cache entry operations, the flow cache may now end up calling policy or bundle destructor which can be slowish. As gc_list is more effective with double linked list, the flow cache is converted to use common hlist and list macroes where appropriate. Signed-off-by: Timo Teras Signed-off-by: David S. Miller --- net/core/flow.c | 100 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/flow.c b/net/core/flow.c index 521df52..1619006 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -26,7 +26,10 @@ #include struct flow_cache_entry { - struct flow_cache_entry *next; + union { + struct hlist_node hlist; + struct list_head gc_list; + } u; u16 family; u8 dir; u32 genid; @@ -35,7 +38,7 @@ struct flow_cache_entry { }; struct flow_cache_percpu { - struct flow_cache_entry **hash_table; + struct hlist_head *hash_table; int hash_count; u32 hash_rnd; int hash_rnd_recalc; @@ -62,6 +65,9 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0); static struct flow_cache flow_cache_global; static struct kmem_cache *flow_cachep; +static DEFINE_SPINLOCK(flow_cache_gc_lock); +static LIST_HEAD(flow_cache_gc_list); + #define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) @@ -86,38 +92,66 @@ static int flow_entry_valid(struct flow_cache_entry *fle) return 1; } -static void flow_entry_kill(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache_entry *fle) { if (fle->object) fle->object->ops->delete(fle->object); kmem_cache_free(flow_cachep, fle); - fcp->hash_count--; +} + +static void flow_cache_gc_task(struct work_struct *work) +{ + struct list_head gc_list; + struct flow_cache_entry *fce, *n; + + INIT_LIST_HEAD(&gc_list); + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail_init(&flow_cache_gc_list, &gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + flow_entry_kill(fce); +} +static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); + +static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, + int deleted, struct list_head *gc_list) +{ + if (deleted) { + fcp->hash_count -= deleted; + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail(gc_list, &flow_cache_gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + schedule_work(&flow_cache_gc_work); + } } static void __flow_cache_shrink(struct flow_cache *fc, struct flow_cache_percpu *fcp, int shrink_to) { - struct flow_cache_entry *fle, **flp; - int i; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; for (i = 0; i < flow_cache_hash_size(fc); i++) { int saved = 0; - flp = &fcp->hash_table[i]; - while ((fle = *flp) != NULL) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { if (saved < shrink_to && flow_entry_valid(fle)) { saved++; - flp = &fle->next; } else { - *flp = fle->next; - flow_entry_kill(fc, fcp, fle); + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); } } } + + flow_cache_queue_garbage(fcp, deleted, &gc_list); } static void flow_cache_shrink(struct flow_cache *fc, @@ -182,7 +216,8 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, { struct flow_cache *fc = &flow_cache_global; struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle, **head; + struct flow_cache_entry *fle, *tfle; + struct hlist_node *entry; struct flow_cache_object *flo; unsigned int hash; @@ -200,12 +235,13 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_new_hash_rnd(fc, fcp); hash = flow_hash_code(fc, fcp, key); - head = &fcp->hash_table[hash]; - for (fle = *head; fle; fle = fle->next) { - if (fle->family == family && - fle->dir == dir && - flow_key_compare(key, &fle->key) == 0) + hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + if (tfle->family == family && + tfle->dir == dir && + flow_key_compare(key, &tfle->key) == 0) { + fle = tfle; break; + } } if (unlikely(!fle)) { @@ -214,12 +250,11 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { - fle->next = *head; - *head = fle; fle->family = family; fle->dir = dir; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; + hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); fcp->hash_count++; } } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { @@ -262,23 +297,26 @@ static void flow_cache_flush_tasklet(unsigned long data) struct flow_flush_info *info = (void *)data; struct flow_cache *fc = info->cache; struct flow_cache_percpu *fcp; - int i; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); for (i = 0; i < flow_cache_hash_size(fc); i++) { - struct flow_cache_entry *fle; - - fle = fcp->hash_table[i]; - for (; fle; fle = fle->next) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { if (flow_entry_valid(fle)) continue; - if (fle->object) - fle->object->ops->delete(fle->object); - fle->object = NULL; + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); } } + flow_cache_queue_garbage(fcp, deleted, &gc_list); + if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); } @@ -320,7 +358,7 @@ void flow_cache_flush(void) static void __init flow_cache_cpu_prepare(struct flow_cache *fc, struct flow_cache_percpu *fcp) { - fcp->hash_table = (struct flow_cache_entry **) + fcp->hash_table = (struct hlist_head *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); if (!fcp->hash_table) panic("NET: failed to allocate flow cache order %lu\n", fc->order); @@ -354,7 +392,7 @@ static int flow_cache_init(struct flow_cache *fc) for (order = 0; (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc)); + (sizeof(struct hlist_head)*flow_cache_hash_size(fc)); order++) /* NOTHING */; fc->order = order; -- cgit v1.1 From 298b9e44be9592e94c0e69a5d3893cd11f5484fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Apr 2010 16:46:36 -0700 Subject: net: include linux/proc_fs.h in dev_addr_lists.c As pointed by Randy Dunlap, we must include linux/proc_fs.h in net/core/dev_addr_lists.c, regardless of CONFIG_PROC_FS Reported-by: Randy Dunlap , Signed-off-by: Eric Dumazet Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 37d5975..508f9c1 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * General list handling functions @@ -667,7 +668,6 @@ void dev_mc_init(struct net_device *dev) EXPORT_SYMBOL(dev_mc_init); #ifdef CONFIG_PROC_FS -#include #include static int dev_mc_seq_show(struct seq_file *seq, void *v) -- cgit v1.1 From 97f8aefbbfb5aa5c9944e5fa8149f1fdaf71c7b6 Mon Sep 17 00:00:00 2001 From: chavey Date: Wed, 7 Apr 2010 21:54:42 -0700 Subject: net: fix ethtool coding style errors and warnings Fix coding style errors and warnings output while running checkpatch.pl on the files net/core/ethtool.c and include/linux/ethtool.h Signed-off-by: chavey Signed-off-by: David S. Miller --- net/core/ethtool.c | 141 ++++++++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 67 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 73c81ed..99e9f85 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* * Some useful ethtool_ops methods that're device independent. @@ -30,6 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev) { return netif_carrier_ok(dev) ? 1 : 0; } +EXPORT_SYMBOL(ethtool_op_get_link); u32 ethtool_op_get_rx_csum(struct net_device *dev) { @@ -62,6 +63,7 @@ int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) { @@ -72,11 +74,13 @@ int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); u32 ethtool_op_get_sg(struct net_device *dev) { return (dev->features & NETIF_F_SG) != 0; } +EXPORT_SYMBOL(ethtool_op_get_sg); int ethtool_op_set_sg(struct net_device *dev, u32 data) { @@ -87,11 +91,13 @@ int ethtool_op_set_sg(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_sg); u32 ethtool_op_get_tso(struct net_device *dev) { return (dev->features & NETIF_F_TSO) != 0; } +EXPORT_SYMBOL(ethtool_op_get_tso); int ethtool_op_set_tso(struct net_device *dev, u32 data) { @@ -102,11 +108,13 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tso); u32 ethtool_op_get_ufo(struct net_device *dev) { return (dev->features & NETIF_F_UFO) != 0; } +EXPORT_SYMBOL(ethtool_op_get_ufo); int ethtool_op_set_ufo(struct net_device *dev, u32 data) { @@ -116,6 +124,7 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data) dev->features &= ~NETIF_F_UFO; return 0; } +EXPORT_SYMBOL(ethtool_op_set_ufo); /* the following list of flags are the same as their associated * NETIF_F_xxx values in include/linux/netdevice.h @@ -132,6 +141,7 @@ u32 ethtool_op_get_flags(struct net_device *dev) return dev->features & flags_dup_features; } +EXPORT_SYMBOL(ethtool_op_get_flags); int ethtool_op_set_flags(struct net_device *dev, u32 data) { @@ -160,6 +170,7 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data) dev->features = features; return 0; } +EXPORT_SYMBOL(ethtool_op_set_flags); void ethtool_ntuple_flush(struct net_device *dev) { @@ -205,7 +216,8 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_settings(dev, &cmd); } -static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, + void __user *useraddr) { struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -245,7 +257,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void _ } static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, - void __user *useraddr) + void __user *useraddr) { struct ethtool_sset_info info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -304,7 +316,8 @@ out: return ret; } -static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + void __user *useraddr) { struct ethtool_rxnfc cmd; @@ -317,7 +330,8 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, void __u return dev->ethtool_ops->set_rxnfc(dev, &cmd); } -static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + void __user *useraddr) { struct ethtool_rxnfc info; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -362,8 +376,8 @@ err_out: } static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, - struct ethtool_rx_ntuple_flow_spec *spec, - struct ethtool_rx_ntuple_flow_spec_container *fsc) + struct ethtool_rx_ntuple_flow_spec *spec, + struct ethtool_rx_ntuple_flow_spec_container *fsc) { /* don't add filters forever */ @@ -389,7 +403,8 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, list->count++; } -static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, + void __user *useraddr) { struct ethtool_rx_ntuple cmd; const struct ethtool_ops *ops = dev->ethtool_ops; @@ -514,125 +529,125 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) case UDP_V4_FLOW: case SCTP_V4_FLOW: sprintf(p, "\tSrc IP addr: 0x%x\n", - fsc->fs.h_u.tcp_ip4_spec.ip4src); + fsc->fs.h_u.tcp_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSrc IP mask: 0x%x\n", - fsc->fs.m_u.tcp_ip4_spec.ip4src); + fsc->fs.m_u.tcp_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP addr: 0x%x\n", - fsc->fs.h_u.tcp_ip4_spec.ip4dst); + fsc->fs.h_u.tcp_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP mask: 0x%x\n", - fsc->fs.m_u.tcp_ip4_spec.ip4dst); + fsc->fs.m_u.tcp_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSrc Port: %d, mask: 0x%x\n", - fsc->fs.h_u.tcp_ip4_spec.psrc, - fsc->fs.m_u.tcp_ip4_spec.psrc); + fsc->fs.h_u.tcp_ip4_spec.psrc, + fsc->fs.m_u.tcp_ip4_spec.psrc); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest Port: %d, mask: 0x%x\n", - fsc->fs.h_u.tcp_ip4_spec.pdst, - fsc->fs.m_u.tcp_ip4_spec.pdst); + fsc->fs.h_u.tcp_ip4_spec.pdst, + fsc->fs.m_u.tcp_ip4_spec.pdst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tTOS: %d, mask: 0x%x\n", - fsc->fs.h_u.tcp_ip4_spec.tos, - fsc->fs.m_u.tcp_ip4_spec.tos); + fsc->fs.h_u.tcp_ip4_spec.tos, + fsc->fs.m_u.tcp_ip4_spec.tos); p += ETH_GSTRING_LEN; num_strings++; break; case AH_ESP_V4_FLOW: case ESP_V4_FLOW: sprintf(p, "\tSrc IP addr: 0x%x\n", - fsc->fs.h_u.ah_ip4_spec.ip4src); + fsc->fs.h_u.ah_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSrc IP mask: 0x%x\n", - fsc->fs.m_u.ah_ip4_spec.ip4src); + fsc->fs.m_u.ah_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP addr: 0x%x\n", - fsc->fs.h_u.ah_ip4_spec.ip4dst); + fsc->fs.h_u.ah_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP mask: 0x%x\n", - fsc->fs.m_u.ah_ip4_spec.ip4dst); + fsc->fs.m_u.ah_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSPI: %d, mask: 0x%x\n", - fsc->fs.h_u.ah_ip4_spec.spi, - fsc->fs.m_u.ah_ip4_spec.spi); + fsc->fs.h_u.ah_ip4_spec.spi, + fsc->fs.m_u.ah_ip4_spec.spi); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tTOS: %d, mask: 0x%x\n", - fsc->fs.h_u.ah_ip4_spec.tos, - fsc->fs.m_u.ah_ip4_spec.tos); + fsc->fs.h_u.ah_ip4_spec.tos, + fsc->fs.m_u.ah_ip4_spec.tos); p += ETH_GSTRING_LEN; num_strings++; break; case IP_USER_FLOW: sprintf(p, "\tSrc IP addr: 0x%x\n", - fsc->fs.h_u.raw_ip4_spec.ip4src); + fsc->fs.h_u.raw_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSrc IP mask: 0x%x\n", - fsc->fs.m_u.raw_ip4_spec.ip4src); + fsc->fs.m_u.raw_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP addr: 0x%x\n", - fsc->fs.h_u.raw_ip4_spec.ip4dst); + fsc->fs.h_u.raw_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP mask: 0x%x\n", - fsc->fs.m_u.raw_ip4_spec.ip4dst); + fsc->fs.m_u.raw_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; break; case IPV4_FLOW: sprintf(p, "\tSrc IP addr: 0x%x\n", - fsc->fs.h_u.usr_ip4_spec.ip4src); + fsc->fs.h_u.usr_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSrc IP mask: 0x%x\n", - fsc->fs.m_u.usr_ip4_spec.ip4src); + fsc->fs.m_u.usr_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP addr: 0x%x\n", - fsc->fs.h_u.usr_ip4_spec.ip4dst); + fsc->fs.h_u.usr_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP mask: 0x%x\n", - fsc->fs.m_u.usr_ip4_spec.ip4dst); + fsc->fs.m_u.usr_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n", - fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, - fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); + fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, + fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tTOS: %d, mask: 0x%x\n", - fsc->fs.h_u.usr_ip4_spec.tos, - fsc->fs.m_u.usr_ip4_spec.tos); + fsc->fs.h_u.usr_ip4_spec.tos, + fsc->fs.m_u.usr_ip4_spec.tos); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tIP Version: %d, mask: 0x%x\n", - fsc->fs.h_u.usr_ip4_spec.ip_ver, - fsc->fs.m_u.usr_ip4_spec.ip_ver); + fsc->fs.h_u.usr_ip4_spec.ip_ver, + fsc->fs.m_u.usr_ip4_spec.ip_ver); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tProtocol: %d, mask: 0x%x\n", - fsc->fs.h_u.usr_ip4_spec.proto, - fsc->fs.m_u.usr_ip4_spec.proto); + fsc->fs.h_u.usr_ip4_spec.proto, + fsc->fs.m_u.usr_ip4_spec.proto); p += ETH_GSTRING_LEN; num_strings++; break; }; sprintf(p, "\tVLAN: %d, mask: 0x%x\n", - fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); + fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data); @@ -645,7 +660,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) sprintf(p, "\tAction: Drop\n"); else sprintf(p, "\tAction: Direct to queue %d\n", - fsc->fs.action); + fsc->fs.action); p += ETH_GSTRING_LEN; num_strings++; unknown_filter: @@ -857,7 +872,8 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) return ret; } -static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, + void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; @@ -871,7 +887,8 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void return 0; } -static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, + void __user *useraddr) { struct ethtool_coalesce coalesce; @@ -975,6 +992,7 @@ static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_tx_csum(dev, edata.data); } +EXPORT_SYMBOL(ethtool_op_set_tx_csum); static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) { @@ -1046,7 +1064,7 @@ static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) edata.data = dev->features & NETIF_F_GSO; if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; + return -EFAULT; return 0; } @@ -1069,7 +1087,7 @@ static int ethtool_get_gro(struct net_device *dev, char __user *useraddr) edata.data = dev->features & NETIF_F_GRO; if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; + return -EFAULT; return 0; } @@ -1281,7 +1299,8 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr, return actor(dev, edata.data); } -static noinline_for_stack int ethtool_flash_device(struct net_device *dev, char __user *useraddr) +static noinline_for_stack int ethtool_flash_device(struct net_device *dev, + char __user *useraddr) { struct ethtool_flash efl; @@ -1310,11 +1329,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (!dev->ethtool_ops) return -EOPNOTSUPP; - if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; /* Allow some commands to be done by anyone */ - switch(ethcmd) { + switch (ethcmd) { case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GCOALESCE: @@ -1342,10 +1361,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return -EPERM; } - if (dev->ethtool_ops->begin) - if ((rc = dev->ethtool_ops->begin(dev)) < 0) + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) return rc; - + } old_features = dev->features; switch (ethcmd) { @@ -1535,16 +1555,3 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return rc; } - -EXPORT_SYMBOL(ethtool_op_get_link); -EXPORT_SYMBOL(ethtool_op_get_sg); -EXPORT_SYMBOL(ethtool_op_get_tso); -EXPORT_SYMBOL(ethtool_op_set_sg); -EXPORT_SYMBOL(ethtool_op_set_tso); -EXPORT_SYMBOL(ethtool_op_set_tx_csum); -EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); -EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); -EXPORT_SYMBOL(ethtool_op_set_ufo); -EXPORT_SYMBOL(ethtool_op_get_ufo); -EXPORT_SYMBOL(ethtool_op_set_flags); -EXPORT_SYMBOL(ethtool_op_get_flags); -- cgit v1.1 From 7a161ea92471087a1579239d7a58dd06eaa5601c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Apr 2010 21:26:13 +0000 Subject: net: Dont use netdev_warn() Dont use netdev_warn() in dev_cap_txqueue() and get_rps_cpu() so that we can catch following warnings without crash. bond0.2240 received packet on queue 6, but number of RX queues is 1 bond0.2240 received packet on queue 11, but number of RX queues is 1 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a10a216..0eb79e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1987,9 +1987,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { if (net_ratelimit()) { - netdev_warn(dev, "selects TX queue %d, but " - "real number of TX queues is %d\n", - queue_index, dev->real_num_tx_queues); + pr_warning("%s selects TX queue %d, but " + "real number of TX queues is %d\n", + dev->name, queue_index, dev->real_num_tx_queues); } return 0; } @@ -2223,9 +2223,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->num_rx_queues)) { if (net_ratelimit()) { - netdev_warn(dev, "received packet on queue " - "%u, but number of RX queues is %u\n", - index, dev->num_rx_queues); + pr_warning("%s received packet on queue " + "%u, but number of RX queues is %u\n", + dev->name, index, dev->num_rx_queues); } goto done; } -- cgit v1.1 From b6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Apr 2010 23:03:29 +0000 Subject: net: sk_dst_cache RCUification With latest CONFIG_PROVE_RCU stuff, I felt more comfortable to make this work. sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock) This rwlock is readlocked for a very small amount of time, and dst entries are already freed after RCU grace period. This calls for RCU again :) This patch converts sk_dst_lock to a spinlock, and use RCU for readers. __sk_dst_get() is supposed to be called with rcu_read_lock() or if socket locked by user, so use appropriate rcu_dereference_check() condition (rcu_read_lock_held() || sock_owned_by_user(sk)) This patch avoids two atomic ops per tx packet on UDP connected sockets, for example, and permits sk_dst_lock to be much less dirtied. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/sock.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0eb79e3..ca4cdef 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2015,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues > 1) queue_index = skb_tx_hash(dev, skb); - if (sk && sk->sk_dst_cache) + if (sk && rcu_dereference_check(sk->sk_dst_cache, 1)) sk_tx_queue_set(sk, queue_index); } } diff --git a/net/core/sock.c b/net/core/sock.c index c5812bb..7effa1e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { - struct dst_entry *dst = sk->sk_dst_cache; + struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); - sk->sk_dst_cache = NULL; + rcu_assign_pointer(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } @@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_async_wait_queue); #endif - rwlock_init(&newsk->sk_dst_lock); + spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_sleep = NULL; - rwlock_init(&sk->sk_dst_lock); + spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, -- cgit v1.1 From acbbc07145b919248c410e1852b953d385be5c97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 11 Apr 2010 06:56:11 +0000 Subject: net: uninline skb_bond_should_drop() skb_bond_should_drop() is too big to be inlined. This patch reduces kernel text size, and its compilation time as well (shrinking include/linux/netdevice.h) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ca4cdef..876b111 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2646,6 +2646,55 @@ void netif_nit_deliver(struct sk_buff *skb) rcu_read_unlock(); } +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, + struct net_device *master) +{ + if (skb->pkt_type == PACKET_HOST) { + u16 *dest = (u16 *) eth_hdr(skb)->h_dest; + + memcpy(dest, master->dev_addr, ETH_ALEN); + } +} + +/* On bonding slaves other than the currently active slave, suppress + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and + * ARP on active-backup slaves with arp_validate enabled. + */ +int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) +{ + struct net_device *dev = skb->dev; + + if (master->priv_flags & IFF_MASTER_ARPMON) + dev->last_rx = jiffies; + + if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) { + /* Do address unmangle. The local destination address + * will be always the one master has. Provides the right + * functionality in a bridge. + */ + skb_bond_set_mac_by_master(skb, master); + } + + if (dev->priv_flags & IFF_SLAVE_INACTIVE) { + if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && + skb->protocol == __cpu_to_be16(ETH_P_ARP)) + return 0; + + if (master->priv_flags & IFF_MASTER_ALB) { + if (skb->pkt_type != PACKET_BROADCAST && + skb->pkt_type != PACKET_MULTICAST) + return 0; + } + if (master->priv_flags & IFF_MASTER_8023AD && + skb->protocol == __cpu_to_be16(ETH_P_SLOW)) + return 0; + + return 1; + } + return 0; +} +EXPORT_SYMBOL(__skb_bond_should_drop); + static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; -- cgit v1.1 From 561155110307ad304226a23272244398fa46cbae Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 12 Apr 2010 07:38:05 +0000 Subject: dst: don't inline dst_ifdown The function dst_ifdown is called only two places but in a non- performance critical code path, there is no reason to inline it. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index b8c22f0..9920722 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -286,8 +286,8 @@ EXPORT_SYMBOL(dst_release); * * Commented and originally written by Alexey. */ -static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) +static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) { if (dst->ops->ifdown) dst->ops->ifdown(dst, dev, unregister); -- cgit v1.1 From d8a566beaa75c6ad5e38cdccf0ea5294323e7866 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 13 Apr 2010 05:03:15 +0000 Subject: net: fib_rules: consolidate IPv4 and DECnet ->default_pref() functions. Both functions are equivalent, consolidate them since a following patch needs a third implementation for multicast routing. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 05cce4e..1eb3227 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -39,6 +39,24 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); +u32 fib_default_rule_pref(struct fib_rules_ops *ops) +{ + struct list_head *pos; + struct fib_rule *rule; + + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { + rule = list_entry(pos->next, struct fib_rule, list); + if (rule->pref) + return rule->pref - 1; + } + } + + return 0; +} +EXPORT_SYMBOL(fib_default_rule_pref); + static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -- cgit v1.1 From 28bb17268b92b0c568f2496e5e631008f9108409 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 13 Apr 2010 05:03:16 +0000 Subject: net: fib_rules: set family in fib_rule_hdr centrally All fib_rules implementations need to set the family in their ->fill() functions. Since the value is available to the generic fib_nl_fill_rule() function, set it there. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1eb3227..1bc6659 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -535,6 +535,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, return -EMSGSIZE; frh = nlmsg_data(nlh); + frh->family = ops->family; frh->table = rule->table; NLA_PUT_U32(skb, FRA_TABLE, rule->table); frh->res1 = 0; -- cgit v1.1 From 0f87b1dd01b51dc3c789f7a212656a4a87eee1bd Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 13 Apr 2010 05:03:17 +0000 Subject: net: fib_rules: decouple address families from real address families Decouple the address family values used for fib_rules from the real address families in socket.h. This allows to use fib_rules for code that is not a real address family without increasing AF_MAX/NPROTO. Values up to 127 are reserved for real address families and map directly to the corresponding AF value, values starting from 128 are for other uses. rtnetlink is changed to invoke the AF_UNSPEC dumpit/doit handlers for these families. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bf919b6..78c85985c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -118,7 +118,11 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) { struct rtnl_link *tab; - tab = rtnl_msg_handlers[protocol]; + if (protocol < NPROTO) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + if (tab == NULL || tab[msgindex].doit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; @@ -129,7 +133,11 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) { struct rtnl_link *tab; - tab = rtnl_msg_handlers[protocol]; + if (protocol < NPROTO) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + if (tab == NULL || tab[msgindex].dumpit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; @@ -1444,9 +1452,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) - return -EAFNOSUPPORT; - sz_idx = type>>2; kind = type&3; -- cgit v1.1 From b0e28f1effd1d840b36e961edc1def81e01b1ca1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Apr 2010 00:14:07 -0700 Subject: net: netif_rx() must disable preemption Eric Paris reported netif_rx() is calling smp_processor_id() from preemptible context, in particular when caller is ip_dev_loopback_xmit(). RPS commit added this smp_processor_id() call, this patch makes sure preemption is disabled. rps_get_cpus() wants rcu_read_lock() anyway, we can dot it a bit earlier. Reported-by: Eric Paris Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 876b111..e8041eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2206,6 +2206,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) { @@ -2217,8 +2218,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) u8 ip_proto; u32 addr1, addr2, ports, ihl; - rcu_read_lock(); - if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->num_rx_queues)) { @@ -2296,7 +2295,6 @@ got_hash: } done: - rcu_read_unlock(); return cpu; } @@ -2392,7 +2390,7 @@ enqueue: int netif_rx(struct sk_buff *skb) { - int cpu; + int ret; /* if netpoll wants it, pretend we never saw it */ if (netpoll_rx(skb)) @@ -2402,14 +2400,21 @@ int netif_rx(struct sk_buff *skb) net_timestamp(skb); #ifdef CONFIG_RPS - cpu = get_rps_cpu(skb->dev, skb); - if (cpu < 0) - cpu = smp_processor_id(); + { + int cpu; + + rcu_read_lock(); + cpu = get_rps_cpu(skb->dev, skb); + if (cpu < 0) + cpu = smp_processor_id(); + ret = enqueue_to_backlog(skb, cpu); + rcu_read_unlock(); + } #else - cpu = smp_processor_id(); + ret = enqueue_to_backlog(skb, get_cpu()); + put_cpu(); #endif - - return enqueue_to_backlog(skb, cpu); + return ret; } EXPORT_SYMBOL(netif_rx); -- cgit v1.1 From fec5e652e58fa6017b2c9e06466cb2a6538de5b4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 16 Apr 2010 16:01:27 -0700 Subject: rfs: Receive Flow Steering This patch implements receive flow steering (RFS). RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running. RFS is an extension of Receive Packet Steering (RPS). The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash which is stored in the socket structure. The rxhash is passed in skb's received on the connection from netif_receive_skb. For each received packet, the associated rxhash is used to look up the CPU in the hash table, if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms. The convolution of the simple approach is that it would potentially allow OOO packets. If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets-- we consider this a non-starter. To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table. rps_sock_table is a global hash table. Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above. This table contains the "desired" CPUs for flows. rps_dev_flow_table is specific to each device queue. Each entry contains a CPU and a tail queue counter. The CPU is the "current" CPU for a matching flow. The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry. Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length. When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table. And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted. When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true: - The current CPU is unset (equal to RPS_NO_CPU) - Current CPU is offline - The current CPU's queue head counter >= queue tail counter in the rps_dev_flow table. This checks if the queue tail has advanced beyond the last packet that was enqueued using this table entry. This guarantees that all packets queued using this entry have been dequeued, thus preserving in order delivery. Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to interrupting CPU s good for locality. 2) this allows lockless access to the table-- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb, we assume that this is only called from device napi_poll which is non-reentrant. This patch implements RFS for TCP and connected UDP sockets. It should be usable for other flow oriented protocols. There are two configuration parameters for RFS. The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, the per rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for the rxqueue. Both are rounded to power of two. The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the applications processing; this can result in increased performance (higher pps, lower latency). The benefits of RFS are dependent on cache hierarchy, application load, and other factors. On simple benchmarks, we don't necessarily see improvement and sometimes see degradation. However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well. Below are some benchmark results which show the potential benfit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. The RPC test is an request/response test similar in structure to netperf RR test ith 100 threads on each host, but does more work in userspace that netperf. e1000e on 8 core Intel No RFS or RPS 104K tps at 30% CPU No RFS (best RPS config): 290K tps at 63% CPU RFS 303K tps at 61% CPU RPC test tps CPU% 50/90/99% usec latency Latency StdDev No RFS/RPS 103K 48% 757/900/3185 4472.35 RPS only: 174K 73% 415/993/2468 491.66 RFS 223K 73% 379/651/1382 315.61 Signed-off-by: Tom Herbert Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 111 +++++++++++++++++++++++++++++++++++++-------- net/core/net-sysfs.c | 94 ++++++++++++++++++++++++++++++++++++-- net/core/sysctl_net_core.c | 68 +++++++++++++++++++++++++++ 3 files changed, 250 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e8041eb..d7107ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2203,19 +2203,28 @@ int weight_p __read_mostly = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; #ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table *rps_sock_flow_table; +EXPORT_SYMBOL(rps_sock_flow_table); + /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) { struct ipv6hdr *ip6; struct iphdr *ip; struct netdev_rx_queue *rxqueue; struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; int cpu = -1; u8 ip_proto; + u16 tcpu; u32 addr1, addr2, ports, ihl; if (skb_rx_queue_recorded(skb)) { @@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) } else rxqueue = dev->_rx; - if (!rxqueue->rps_map) + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) goto done; if (skb->rxhash) @@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) skb->rxhash = 1; got_hash: + flow_table = rcu_dereference(rxqueue->rps_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + u16 next_cpu; + struct rps_dev_flow *rflow; + + rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + tcpu = rflow->cpu; + + next_cpu = sock_flow_table->ents[skb->rxhash & + sock_flow_table->mask]; + + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. + */ + if (unlikely(tcpu != next_cpu) && + (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) { + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) + rflow->last_qtail = per_cpu(softnet_data, + tcpu).input_queue_head; + } + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + map = rcu_dereference(rxqueue->rps_map); if (map) { - u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; if (cpu_online(tcpu)) { cpu = tcpu; @@ -2320,13 +2368,14 @@ static void trigger_softirq(void *data) __napi_schedule(&queue->backlog); __get_cpu_var(netdev_rx_stat).received_rps++; } -#endif /* CONFIG_SMP */ +#endif /* CONFIG_RPS */ /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ -static int enqueue_to_backlog(struct sk_buff *skb, int cpu) +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) { struct softnet_data *queue; unsigned long flags; @@ -2341,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu) if (queue->input_pkt_queue.qlen) { enqueue: __skb_queue_tail(&queue->input_pkt_queue, skb); +#ifdef CONFIG_RPS + *qtail = queue->input_queue_head + + queue->input_pkt_queue.qlen; +#endif rps_unlock(queue); local_irq_restore(flags); return NET_RX_SUCCESS; @@ -2355,11 +2408,10 @@ enqueue: cpu_set(cpu, rcpus->mask[rcpus->select]); __raise_softirq_irqoff(NET_RX_SOFTIRQ); - } else - __napi_schedule(&queue->backlog); -#else - __napi_schedule(&queue->backlog); + goto enqueue; + } #endif + __napi_schedule(&queue->backlog); } goto enqueue; } @@ -2401,18 +2453,25 @@ int netif_rx(struct sk_buff *skb) #ifdef CONFIG_RPS { + struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); - cpu = get_rps_cpu(skb->dev, skb); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); - ret = enqueue_to_backlog(skb, cpu); + + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); } #else - ret = enqueue_to_backlog(skb, get_cpu()); - put_cpu(); + { + unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); + put_cpu(); + } #endif return ret; } @@ -2830,14 +2889,22 @@ out: int netif_receive_skb(struct sk_buff *skb) { #ifdef CONFIG_RPS - int cpu; + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; + + rcu_read_lock(); - cpu = get_rps_cpu(skb->dev, skb); + cpu = get_rps_cpu(skb->dev, skb, &rflow); - if (cpu < 0) - return __netif_receive_skb(skb); - else - return enqueue_to_backlog(skb, cpu); + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + ret = __netif_receive_skb(skb); + } + + return ret; #else return __netif_receive_skb(skb); #endif @@ -2856,6 +2923,7 @@ static void flush_backlog(void *arg) if (skb->dev == dev) { __skb_unlink(skb, &queue->input_pkt_queue); kfree_skb(skb); + incr_input_queue_head(queue); } rps_unlock(queue); } @@ -3179,6 +3247,7 @@ static int process_backlog(struct napi_struct *napi, int quota) local_irq_enable(); break; } + incr_input_queue_head(queue); rps_unlock(queue); local_irq_enable(); @@ -5542,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb, local_irq_enable(); /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); + incr_input_queue_head(oldsd); + } return NOTIFY_OK; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 96ed690..143052a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "net-sysfs.h" @@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, return len; } +static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + char *buf) +{ + struct rps_dev_flow_table *flow_table; + unsigned int val = 0; + + rcu_read_lock(); + flow_table = rcu_dereference(queue->rps_flow_table); + if (flow_table) + val = flow_table->mask + 1; + rcu_read_unlock(); + + return sprintf(buf, "%u\n", val); +} + +static void rps_dev_flow_table_release_work(struct work_struct *work) +{ + struct rps_dev_flow_table *table = container_of(work, + struct rps_dev_flow_table, free_work); + + vfree(table); +} + +static void rps_dev_flow_table_release(struct rcu_head *rcu) +{ + struct rps_dev_flow_table *table = container_of(rcu, + struct rps_dev_flow_table, rcu); + + INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); + schedule_work(&table->free_work); +} + +ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + const char *buf, size_t len) +{ + unsigned int count; + char *endp; + struct rps_dev_flow_table *table, *old_table; + static DEFINE_SPINLOCK(rps_dev_flow_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + count = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + if (count) { + int i; + + if (count > 1<<30) { + /* Enforce a limit to prevent overflow */ + return -EINVAL; + } + count = roundup_pow_of_two(count); + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); + if (!table) + return -ENOMEM; + + table->mask = count - 1; + for (i = 0; i < count; i++) + table->flows[i].cpu = RPS_NO_CPU; + } else + table = NULL; + + spin_lock(&rps_dev_flow_lock); + old_table = queue->rps_flow_table; + rcu_assign_pointer(queue->rps_flow_table, table); + spin_unlock(&rps_dev_flow_lock); + + if (old_table) + call_rcu(&old_table->rcu, rps_dev_flow_table_release); + + return len; +} + static struct rx_queue_attribute rps_cpus_attribute = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = + __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); + static struct attribute *rx_queue_default_attrs[] = { &rps_cpus_attribute.attr, + &rps_dev_flow_table_cnt_attribute.attr, NULL }; static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); - struct rps_map *map = queue->rps_map; struct netdev_rx_queue *first = queue->first; - if (map) - call_rcu(&map->rcu, rps_map_release); + if (queue->rps_map) + call_rcu(&queue->rps_map->rcu, rps_map_release); + + if (queue->rps_flow_table) + call_rcu(&queue->rps_flow_table->rcu, + rps_dev_flow_table_release); if (atomic_dec_and_test(&first->count)) kfree(first); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b7b6b82..dcc7d25 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -11,12 +11,72 @@ #include #include #include +#include #include #include #include #include +#ifdef CONFIG_RPS +static int rps_sock_flow_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int orig_size, size; + int ret, i; + ctl_table tmp = { + .data = &size, + .maxlen = sizeof(size), + .mode = table->mode + }; + struct rps_sock_flow_table *orig_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + + mutex_lock(&sock_flow_mutex); + + orig_sock_table = rps_sock_flow_table; + size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + if (write) { + if (size) { + if (size > 1<<30) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = + vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); + if (!sock_table) { + mutex_unlock(&sock_flow_mutex); + return -ENOMEM; + } + + sock_table->mask = size - 1; + } else + sock_table = orig_sock_table; + + for (i = 0; i < size; i++) + sock_table->ents[i] = RPS_NO_CPU; + } else + sock_table = NULL; + + if (sock_table != orig_sock_table) { + rcu_assign_pointer(rps_sock_flow_table, sock_table); + synchronize_rcu(); + vfree(orig_sock_table); + } + } + + mutex_unlock(&sock_flow_mutex); + + return ret; +} +#endif /* CONFIG_RPS */ + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -82,6 +142,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_RPS + { + .procname = "rps_sock_flow_entries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rps_sock_flow_sysctl + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", -- cgit v1.1 From 8770acf0494ae06de6abd34f951a436f8f15d1de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 17 Apr 2010 00:54:36 -0700 Subject: rps: rps_sock_flow_table is mostly read Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d7107ac..7abf959 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2205,7 +2205,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table; +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); /* -- cgit v1.1 From 9958da0501fced47c1ac5c5a3a7731c87e45472c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 17 Apr 2010 04:17:02 +0000 Subject: net: remove time limit in process_backlog() - There is no point to enforce a time limit in process_backlog(), since other napi instances dont follow same rule. We can exit after only one packet processed... The normal quota of 64 packets per napi instance should be the norm, and net_rx_action() already has its own time limit. Note : /proc/net/core/dev_weight can be used to tune this 64 default value. - Use DEFINE_PER_CPU_ALIGNED for softnet_data definition. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7abf959..8092f01 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -264,7 +264,7 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); #ifdef CONFIG_LOCKDEP @@ -3232,7 +3232,6 @@ static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; napi->weight = weight_p; do { @@ -3252,7 +3251,7 @@ static int process_backlog(struct napi_struct *napi, int quota) local_irq_enable(); __netif_receive_skb(skb); - } while (++work < quota && jiffies == start_time); + } while (++work < quota); return work; } -- cgit v1.1 From fc6055a5ba31e2c14e36e8939f9bf2b6d586a7f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Apr 2010 12:18:22 +0000 Subject: net: Introduce skb_orphan_try() Transmitted skb might be attached to a socket and a destructor, for memory accounting purposes. Traditionally, this destructor is called at tx completion time, when skb is freed. When tx completion is performed by another cpu than the sender, this forces some cache lines to change ownership. XPS was an attempt to give tx completion to initial cpu. David idea is to call destructor right before giving skb to device (call to ndo_start_xmit()). Because device queues are usually small, orphaning skb before tx completion is not a big deal. Some drivers already do this, we could do it in upper level. There is one known exception to this early orphaning, called tx timestamping. It needs to keep a reference to socket until device can give a hardware or software timestamp. This patch adds a skb_orphan_try() helper, to centralize all exceptions to early orphaning in one spot, and use it in dev_hard_start_xmit(). "tbench 16" results on a Nehalem machine (2 X5570 @ 2.93GHz) before: Throughput 4428.9 MB/sec 16 procs after: Throughput 4448.14 MB/sec 16 procs UDP should get even better results, its destructor being more complex, since SOCK_USE_WRITE_QUEUE is not set (four atomic ops instead of one) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8092f01..8eb50e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1880,6 +1880,17 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +/* + * Try to orphan skb early, right before transmission by the device. + * We cannot orphan skb if tx timestamp is requested, since + * drivers need to call skb_tstamp_tx() to send the timestamp. + */ +static inline void skb_orphan_try(struct sk_buff *skb) +{ + if (!skb_tx(skb)->flags) + skb_orphan(skb); +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -1904,23 +1915,10 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); + skb_orphan_try(skb); rc = ops->ndo_start_xmit(skb, dev); if (rc == NETDEV_TX_OK) txq_trans_update(txq); - /* - * TODO: if skb_orphan() was called by - * dev->hard_start_xmit() (for example, the unmodified - * igb driver does that; bnx2 doesn't), then - * skb_tx_software_timestamp() will be unable to send - * back the time stamp. - * - * How can this be prevented? Always create another - * reference to the socket before calling - * dev->hard_start_xmit()? Prevent that skb_orphan() - * does anything in dev->hard_start_xmit() by clearing - * the skb destructor before the call and restoring it - * afterwards, then doing the skb_orphan() ourselves? - */ return rc; } @@ -1938,6 +1936,7 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); + skb_orphan_try(nskb); rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) -- cgit v1.1 From 88751275b8e867d756e4f86ae92afe0232de129f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Apr 2010 05:07:33 +0000 Subject: rps: shortcut net_rps_action() net_rps_action() is a bit expensive on NR_CPUS=64..4096 kernels, even if RPS is not active. Tom Herbert used two bitmasks to hold information needed to send IPI, but a single LIFO list seems more appropriate. Move all RPS logic into net_rps_action() to cleanup net_rx_action() code (remove two ifdefs) Move rps_remote_softirq_cpus into softnet_data to share its first cache line, filling an existing hole. In a future patch, we could call net_rps_action() from process_backlog() to make sure we send IPI before handling this cpu backlog. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 79 ++++++++++++++++++++++++---------------------------------- 1 file changed, 32 insertions(+), 47 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8eb50e2..05a2b29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2345,21 +2345,6 @@ done: return cpu; } -/* - * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled - * to be sent to kick remote softirq processing. There are two masks since - * the sending of IPIs must be done with interrupts enabled. The select field - * indicates the current mask that enqueue_backlog uses to schedule IPIs. - * select is flipped before net_rps_action is called while still under lock, - * net_rps_action then uses the non-selected mask to send the IPIs and clears - * it without conflicting with enqueue_backlog operation. - */ -struct rps_remote_softirq_cpus { - cpumask_t mask[2]; - int select; -}; -static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus); - /* Called from hardirq (IPI) context */ static void trigger_softirq(void *data) { @@ -2402,10 +2387,12 @@ enqueue: if (napi_schedule_prep(&queue->backlog)) { #ifdef CONFIG_RPS if (cpu != smp_processor_id()) { - struct rps_remote_softirq_cpus *rcpus = - &__get_cpu_var(rps_remote_softirq_cpus); + struct softnet_data *myqueue; + + myqueue = &__get_cpu_var(softnet_data); + queue->rps_ipi_next = myqueue->rps_ipi_list; + myqueue->rps_ipi_list = queue; - cpu_set(cpu, rcpus->mask[rcpus->select]); __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto enqueue; } @@ -2910,7 +2897,9 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); -/* Network device is going away, flush any packets still pending */ +/* Network device is going away, flush any packets still pending + * Called with irqs disabled. + */ static void flush_backlog(void *arg) { struct net_device *dev = arg; @@ -3338,24 +3327,33 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); -#ifdef CONFIG_RPS /* - * net_rps_action sends any pending IPI's for rps. This is only called from - * softirq and interrupts must be enabled. + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. */ -static void net_rps_action(cpumask_t *mask) +static void net_rps_action(void) { - int cpu; +#ifdef CONFIG_RPS + struct softnet_data *locqueue = &__get_cpu_var(softnet_data); + struct softnet_data *remqueue = locqueue->rps_ipi_list; - /* Send pending IPI's to kick RPS processing on remote cpus. */ - for_each_cpu_mask_nr(cpu, *mask) { - struct softnet_data *queue = &per_cpu(softnet_data, cpu); - if (cpu_online(cpu)) - __smp_call_function_single(cpu, &queue->csd, 0); - } - cpus_clear(*mask); -} + if (remqueue) { + locqueue->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + while (remqueue) { + struct softnet_data *next = remqueue->rps_ipi_next; + if (cpu_online(remqueue->cpu)) + __smp_call_function_single(remqueue->cpu, + &remqueue->csd, 0); + remqueue = next; + } + } else #endif + local_irq_enable(); +} static void net_rx_action(struct softirq_action *h) { @@ -3363,10 +3361,6 @@ static void net_rx_action(struct softirq_action *h) unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; -#ifdef CONFIG_RPS - int select; - struct rps_remote_softirq_cpus *rcpus; -#endif local_irq_disable(); @@ -3429,17 +3423,7 @@ static void net_rx_action(struct softirq_action *h) netpoll_poll_unlock(have); } out: -#ifdef CONFIG_RPS - rcpus = &__get_cpu_var(rps_remote_softirq_cpus); - select = rcpus->select; - rcpus->select ^= 1; - - local_irq_enable(); - - net_rps_action(&rcpus->mask[select]); -#else - local_irq_enable(); -#endif + net_rps_action(); #ifdef CONFIG_NET_DMA /* @@ -5839,6 +5823,7 @@ static int __init net_dev_init(void) queue->csd.func = trigger_softirq; queue->csd.info = queue; queue->csd.flags = 0; + queue->cpu = i; #endif queue->backlog.poll = process_backlog; -- cgit v1.1 From f5acb907dc24c3822f408211bad1cd6e5d0433cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Apr 2010 14:40:57 -0700 Subject: rps: static functions store_rps_map() & store_rps_dev_flow_table_cnt() are static. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 143052a..c57c4b2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -550,7 +550,7 @@ static void rps_map_release(struct rcu_head *rcu) kfree(map); } -ssize_t store_rps_map(struct netdev_rx_queue *queue, +static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, const char *buf, size_t len) { @@ -635,7 +635,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu) schedule_work(&table->free_work); } -ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, +static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, struct rx_queue_attribute *attr, const char *buf, size_t len) { -- cgit v1.1 From e36fa2f7e92f25aab2e3d787dcfe3590817f19d3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Apr 2010 21:17:14 +0000 Subject: rps: cleanups struct softnet_data holds many queues, so consistent use "sd" name instead of "queue" is better. Adds a rps_ipi_queued() helper to cleanup enqueue_to_backlog() Adds a _and_irq_disable suffix to net_rps_action() name, as David suggested. incr_input_queue_head() becomes input_queue_head_incr() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 149 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 80 insertions(+), 69 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 05a2b29..7f5755b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -208,17 +208,17 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *queue) +static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&queue->input_pkt_queue.lock); + spin_lock(&sd->input_pkt_queue.lock); #endif } -static inline void rps_unlock(struct softnet_data *queue) +static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&queue->input_pkt_queue.lock); + spin_unlock(&sd->input_pkt_queue.lock); #endif } @@ -2346,63 +2346,74 @@ done: } /* Called from hardirq (IPI) context */ -static void trigger_softirq(void *data) +static void rps_trigger_softirq(void *data) { - struct softnet_data *queue = data; - __napi_schedule(&queue->backlog); + struct softnet_data *sd = data; + + __napi_schedule(&sd->backlog); __get_cpu_var(netdev_rx_stat).received_rps++; } + #endif /* CONFIG_RPS */ /* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int rps_ipi_queued(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *mysd = &__get_cpu_var(softnet_data); + + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + return 0; +} + +/* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { - struct softnet_data *queue; + struct softnet_data *sd; unsigned long flags; - queue = &per_cpu(softnet_data, cpu); + sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); __get_cpu_var(netdev_rx_stat).total++; - rps_lock(queue); - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { + rps_lock(sd); + if (sd->input_pkt_queue.qlen <= netdev_max_backlog) { + if (sd->input_pkt_queue.qlen) { enqueue: - __skb_queue_tail(&queue->input_pkt_queue, skb); + __skb_queue_tail(&sd->input_pkt_queue, skb); #ifdef CONFIG_RPS - *qtail = queue->input_queue_head + - queue->input_pkt_queue.qlen; + *qtail = sd->input_queue_head + sd->input_pkt_queue.qlen; #endif - rps_unlock(queue); + rps_unlock(sd); local_irq_restore(flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device */ - if (napi_schedule_prep(&queue->backlog)) { -#ifdef CONFIG_RPS - if (cpu != smp_processor_id()) { - struct softnet_data *myqueue; - - myqueue = &__get_cpu_var(softnet_data); - queue->rps_ipi_next = myqueue->rps_ipi_list; - myqueue->rps_ipi_list = queue; - - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto enqueue; - } -#endif - __napi_schedule(&queue->backlog); + if (napi_schedule_prep(&sd->backlog)) { + if (!rps_ipi_queued(sd)) + __napi_schedule(&sd->backlog); } goto enqueue; } - rps_unlock(queue); + rps_unlock(sd); __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); @@ -2903,17 +2914,17 @@ EXPORT_SYMBOL(netif_receive_skb); static void flush_backlog(void *arg) { struct net_device *dev = arg; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &__get_cpu_var(softnet_data); struct sk_buff *skb, *tmp; - rps_lock(queue); - skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) + rps_lock(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) if (skb->dev == dev) { - __skb_unlink(skb, &queue->input_pkt_queue); + __skb_unlink(skb, &sd->input_pkt_queue); kfree_skb(skb); - incr_input_queue_head(queue); + input_queue_head_incr(sd); } - rps_unlock(queue); + rps_unlock(sd); } static int napi_gro_complete(struct sk_buff *skb) @@ -3219,23 +3230,23 @@ EXPORT_SYMBOL(napi_gro_frags); static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &__get_cpu_var(softnet_data); napi->weight = weight_p; do { struct sk_buff *skb; local_irq_disable(); - rps_lock(queue); - skb = __skb_dequeue(&queue->input_pkt_queue); + rps_lock(sd); + skb = __skb_dequeue(&sd->input_pkt_queue); if (!skb) { __napi_complete(napi); - rps_unlock(queue); + rps_unlock(sd); local_irq_enable(); break; } - incr_input_queue_head(queue); - rps_unlock(queue); + input_queue_head_incr(sd); + rps_unlock(sd); local_irq_enable(); __netif_receive_skb(skb); @@ -3331,24 +3342,25 @@ EXPORT_SYMBOL(netif_napi_del); * net_rps_action sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ -static void net_rps_action(void) +static void net_rps_action_and_irq_disable(void) { #ifdef CONFIG_RPS - struct softnet_data *locqueue = &__get_cpu_var(softnet_data); - struct softnet_data *remqueue = locqueue->rps_ipi_list; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *remsd = sd->rps_ipi_list; - if (remqueue) { - locqueue->rps_ipi_list = NULL; + if (remsd) { + sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ - while (remqueue) { - struct softnet_data *next = remqueue->rps_ipi_next; - if (cpu_online(remqueue->cpu)) - __smp_call_function_single(remqueue->cpu, - &remqueue->csd, 0); - remqueue = next; + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + __smp_call_function_single(remsd->cpu, + &remsd->csd, 0); + remsd = next; } } else #endif @@ -3423,7 +3435,7 @@ static void net_rx_action(struct softirq_action *h) netpoll_poll_unlock(have); } out: - net_rps_action(); + net_rps_action_and_irq_disable(); #ifdef CONFIG_NET_DMA /* @@ -5595,7 +5607,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - incr_input_queue_head(oldsd); + input_queue_head_incr(oldsd); } return NOTIFY_OK; @@ -5812,24 +5824,23 @@ static int __init net_dev_init(void) */ for_each_possible_cpu(i) { - struct softnet_data *queue; + struct softnet_data *sd = &per_cpu(softnet_data, i); - queue = &per_cpu(softnet_data, i); - skb_queue_head_init(&queue->input_pkt_queue); - queue->completion_queue = NULL; - INIT_LIST_HEAD(&queue->poll_list); + skb_queue_head_init(&sd->input_pkt_queue); + sd->completion_queue = NULL; + INIT_LIST_HEAD(&sd->poll_list); #ifdef CONFIG_RPS - queue->csd.func = trigger_softirq; - queue->csd.info = queue; - queue->csd.flags = 0; - queue->cpu = i; + sd->csd.func = rps_trigger_softirq; + sd->csd.info = sd; + sd->csd.flags = 0; + sd->cpu = i; #endif - queue->backlog.poll = process_backlog; - queue->backlog.weight = weight_p; - queue->backlog.gro_list = NULL; - queue->backlog.gro_count = 0; + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; + sd->backlog.gro_list = NULL; + sd->backlog.gro_count = 0; } dev_boot_phase = 0; -- cgit v1.1 From b249dcb82d327e419d3cb45773b146ebb5faf419 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Apr 2010 21:56:38 +0000 Subject: rps: consistent rxhash In case we compute a software skb->rxhash, we can generate a consistent hash : Its value will be the same in both flow directions. This helps some workloads, like conntracking, since the same state needs to be accessed in both directions. tbench + RFS + this patch gives better results than tbench with default kernel configuration (no RPS, no RFS) Also fixed some sparse warnings. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7f5755b..0d78e04 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1974,7 +1974,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else - hash = skb->protocol; + hash = (__force u16) skb->protocol; hash = jhash_1word(hash, hashrnd); @@ -2253,8 +2253,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ip = (struct iphdr *) skb->data; ip_proto = ip->protocol; - addr1 = ip->saddr; - addr2 = ip->daddr; + addr1 = (__force u32) ip->saddr; + addr2 = (__force u32) ip->daddr; ihl = ip->ihl; break; case __constant_htons(ETH_P_IPV6): @@ -2263,8 +2263,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ip6 = (struct ipv6hdr *) skb->data; ip_proto = ip6->nexthdr; - addr1 = ip6->saddr.s6_addr32[3]; - addr2 = ip6->daddr.s6_addr32[3]; + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; ihl = (40 >> 2); break; default: @@ -2279,14 +2279,25 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, case IPPROTO_AH: case IPPROTO_SCTP: case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4)) - ports = *((u32 *) (skb->data + (ihl * 4))); + if (pskb_may_pull(skb, (ihl * 4) + 4)) { + __be16 *hports = (__be16 *) (skb->data + (ihl * 4)); + u32 sport, dport; + + sport = (__force u16) hports[0]; + dport = (__force u16) hports[1]; + if (dport < sport) + swap(sport, dport); + ports = (sport << 16) + dport; + } break; default: break; } + /* get a consistent hash (same value on both flow directions) */ + if (addr2 < addr1) + swap(addr1, addr2); skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd); if (!skb->rxhash) skb->rxhash = 1; -- cgit v1.1 From ab9304717f7624c41927f442e6b6d418b2d8b3e4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 20 Apr 2010 01:45:37 -0700 Subject: net: emphasize rtnl lock required in call_netdevice_notifiers Since netdev_chain is guarded by rtnl_lock, ASSERT_RTNL should be present here to make sure that all callers of call_netdevice_notifiers does the locking properly. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0d78e04..b31d5d6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1435,6 +1435,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } -- cgit v1.1 From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 6 +++--- net/core/sock.c | 16 ++++++++-------- net/core/stream.c | 16 ++++++++-------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 2dccd4e..5574a5d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -86,7 +86,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); - prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); @@ -115,7 +115,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) error = 0; *timeo_p = schedule_timeout(*timeo_p); out: - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return error; interrupted: error = sock_intr_errno(*timeo_p); @@ -726,7 +726,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; unsigned int mask; - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* exceptional events? */ diff --git a/net/core/sock.c b/net/core/sock.c index 7effa1e..58ebd14 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1395,7 +1395,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) break; if (sk->sk_shutdown & SEND_SHUTDOWN) @@ -1404,7 +1404,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; timeo = schedule_timeout(timeo); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return timeo; } @@ -1570,11 +1570,11 @@ int sk_wait_data(struct sock *sk, long *timeo) int rc; DEFINE_WAIT(wait); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); @@ -1798,7 +1798,7 @@ static void sock_def_wakeup(struct sock *sk) { read_lock(&sk->sk_callback_lock); if (sk_has_sleeper(sk)) - wake_up_interruptible_all(sk->sk_sleep); + wake_up_interruptible_all(sk_sleep(sk)); read_unlock(&sk->sk_callback_lock); } @@ -1806,7 +1806,7 @@ static void sock_def_error_report(struct sock *sk) { read_lock(&sk->sk_callback_lock); if (sk_has_sleeper(sk)) - wake_up_interruptible_poll(sk->sk_sleep, POLLERR); + wake_up_interruptible_poll(sk_sleep(sk), POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); read_unlock(&sk->sk_callback_lock); } @@ -1815,7 +1815,7 @@ static void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->sk_callback_lock); if (sk_has_sleeper(sk)) - wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | + wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&sk->sk_callback_lock); @@ -1830,7 +1830,7 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { if (sk_has_sleeper(sk)) - wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | + wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND); /* Should agree with poll, otherwise some programs break */ diff --git a/net/core/stream.c b/net/core/stream.c index a37debf..7b3c3f3 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,8 +32,8 @@ void sk_stream_write_space(struct sock *sk) if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | + if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) + wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND); if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); @@ -66,13 +66,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); return 0; @@ -96,13 +96,13 @@ void sk_stream_wait_close(struct sock *sk, long timeout) DEFINE_WAIT(wait); do { - prepare_to_wait(sk->sk_sleep, &wait, + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) break; } while (!signal_pending(current) && timeout); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); } } @@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) while (1) { set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; @@ -157,7 +157,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return err; do_error: -- cgit v1.1 From ccb7c7732e2ceb4e81a7806faf1670be9681ccd2 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Tue, 20 Apr 2010 22:39:53 -0700 Subject: net: Remove two unnecessary exports (skbuff). There is no need to export skb_under_panic() and skb_over_panic() in skbuff.c, since these methods are used only in skbuff.c ; this patch removes these two exports. It also marks these functions as 'static' and removeS the extern declarations of them from include/linux/skbuff.h Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bdea0ef..4218ff4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -117,7 +117,7 @@ static const struct pipe_buf_operations sock_pipe_buf_ops = { * * Out of line support code for skb_put(). Not user callable. */ -void skb_over_panic(struct sk_buff *skb, int sz, void *here) +static void skb_over_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " "data:%p tail:%#lx end:%#lx dev:%s\n", @@ -126,7 +126,6 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : ""); BUG(); } -EXPORT_SYMBOL(skb_over_panic); /** * skb_under_panic - private function @@ -137,7 +136,7 @@ EXPORT_SYMBOL(skb_over_panic); * Out of line support code for skb_push(). Not user callable. */ -void skb_under_panic(struct sk_buff *skb, int sz, void *here) +static void skb_under_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " "data:%p tail:%#lx end:%#lx dev:%s\n", @@ -146,7 +145,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : ""); BUG(); } -EXPORT_SYMBOL(skb_under_panic); /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the -- cgit v1.1 From 9a20e3197e7f6097897c6d1f18335a326ee06299 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 20:08:36 +0000 Subject: net: Introduce skb_orphan_try() At this point, skb->destructor is not the original one (stored in DEV_GSO_CB(skb)->destructor) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e904c47..9bf1ccc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1937,7 +1937,6 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); - skb_orphan_try(nskb); rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) -- cgit v1.1 From e326bed2f47d0365da5a8faaf8ee93ed2d86325b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Apr 2010 00:22:45 -0700 Subject: rps: immediate send IPI in process_backlog() If some skb are queued to our backlog, we are delaying IPI sending at the end of net_rx_action(), increasing latencies. This defeats the queueing, since we want to quickly dispatch packets to the pool of worker cpus, then eventually deeply process our packets. It's better to send IPI before processing our packets in upper layers, from process_backlog(). Change the _and_disable_irq suffix to _and_enable_irq(), since we enable local irq in net_rps_action(), sorry for the confusion. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 76 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 34 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9bf1ccc..3ba774b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3242,11 +3242,48 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + __smp_call_function_single(remsd->cpu, + &remsd->csd, 0); + remsd = next; + } + } else +#endif + local_irq_enable(); +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = &__get_cpu_var(softnet_data); +#ifdef CONFIG_RPS + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd->rps_ipi_list) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } +#endif napi->weight = weight_p; do { struct sk_buff *skb; @@ -3353,45 +3390,16 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); -/* - * net_rps_action sends any pending IPI's for rps. - * Note: called with local irq disabled, but exits with local irq enabled. - */ -static void net_rps_action_and_irq_disable(void) -{ -#ifdef CONFIG_RPS - struct softnet_data *sd = &__get_cpu_var(softnet_data); - struct softnet_data *remsd = sd->rps_ipi_list; - - if (remsd) { - sd->rps_ipi_list = NULL; - - local_irq_enable(); - - /* Send pending IPI's to kick RPS processing on remote cpus. */ - while (remsd) { - struct softnet_data *next = remsd->rps_ipi_next; - - if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); - remsd = next; - } - } else -#endif - local_irq_enable(); -} - static void net_rx_action(struct softirq_action *h) { - struct list_head *list = &__get_cpu_var(softnet_data).poll_list; + struct softnet_data *sd = &__get_cpu_var(softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; local_irq_disable(); - while (!list_empty(list)) { + while (!list_empty(&sd->poll_list)) { struct napi_struct *n; int work, weight; @@ -3409,7 +3417,7 @@ static void net_rx_action(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - n = list_first_entry(list, struct napi_struct, poll_list); + n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); have = netpoll_poll_lock(n); @@ -3444,13 +3452,13 @@ static void net_rx_action(struct softirq_action *h) napi_complete(n); local_irq_disable(); } else - list_move_tail(&n->poll_list, list); + list_move_tail(&n->poll_list, &sd->poll_list); } netpoll_poll_unlock(have); } out: - net_rps_action_and_irq_disable(); + net_rps_action_and_irq_enable(sd); #ifdef CONFIG_NET_DMA /* -- cgit v1.1 From 9ccb8975940c4ee51161152e37058e3d9e06c62f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 22 Apr 2010 01:02:07 -0700 Subject: net: Orphan and de-dst skbs earlier in xmit path. This way GSO packets don't get handled differently. With help from Eric Dumazet. Signed-off-by: David S. Miller Signed-off-by: Eric Dumazet --- net/core/dev.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3ba774b..a4a7c36 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1902,13 +1902,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); - if (netif_needs_gso(dev, skb)) { - if (unlikely(dev_gso_segment(skb))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } - /* * If device doesnt need skb->dst, release it right now while * its hot in this cpu cache @@ -1917,6 +1910,14 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); skb_orphan_try(skb); + + if (netif_needs_gso(dev, skb)) { + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } + rc = ops->ndo_start_xmit(skb, dev); if (rc == NETDEV_TX_OK) txq_trans_update(txq); -- cgit v1.1 From 40eaf96271526a9f71030dd1a199ce46c045752e Mon Sep 17 00:00:00 2001 From: Paul LeoNerd Evans Date: Thu, 22 Apr 2010 03:32:22 +0000 Subject: net: Socket filter ancilliary data access for skb->dev->type Add an SKF_AD_HATYPE field to the packet ancilliary data area, giving access to skb->dev->type, as reported in the sll_hatype field. When capturing packets on a PF_PACKET/SOCK_RAW socket bound to all interfaces, there doesn't appear to be a way for the filter program to actually find out the underlying hardware type the packet was captured on. This patch adds such ability. This patch also handles the case where skb->dev can be NULL, such as on netlink sockets. Signed-off-by: Paul Evans Signed-off-by: David S. Miller --- net/core/filter.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index ff943be..da69fb7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -302,6 +302,8 @@ load_b: A = skb->pkt_type; continue; case SKF_AD_IFINDEX: + if (!skb->dev) + return 0; A = skb->dev->ifindex; continue; case SKF_AD_MARK: @@ -310,6 +312,11 @@ load_b: case SKF_AD_QUEUE: A = skb->queue_mapping; continue; + case SKF_AD_HATYPE: + if (!skb->dev) + return 0; + A = skb->dev->type; + continue; case SKF_AD_NLATTR: { struct nlattr *nla; -- cgit v1.1 From 8c52d509e84bbf26cffb8b6e75b399689af67885 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 24 Apr 2010 22:50:10 -0700 Subject: rps: optimize rps_get_cpu() optimize rps_get_cpu(). don't initialize ports when we can get the ports. one memory access for ports than two. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a4a7c36..4d43f1a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2229,7 +2229,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, int cpu = -1; u8 ip_proto; u16 tcpu; - u32 addr1, addr2, ports, ihl; + u32 addr1, addr2, ihl; + union { + u32 v32; + u16 v16[2]; + } ports; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); @@ -2275,7 +2279,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, default: goto done; } - ports = 0; switch (ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -2285,25 +2288,20 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, case IPPROTO_SCTP: case IPPROTO_UDPLITE: if (pskb_may_pull(skb, (ihl * 4) + 4)) { - __be16 *hports = (__be16 *) (skb->data + (ihl * 4)); - u32 sport, dport; - - sport = (__force u16) hports[0]; - dport = (__force u16) hports[1]; - if (dport < sport) - swap(sport, dport); - ports = (sport << 16) + dport; + ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); + if (ports.v16[1] < ports.v16[0]) + swap(ports.v16[0], ports.v16[1]); + break; } - break; - default: + ports.v32 = 0; break; } /* get a consistent hash (same value on both flow directions) */ if (addr2 < addr1) swap(addr1, addr2); - skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd); + skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); if (!skb->rxhash) skb->rxhash = 1; -- cgit v1.1 From b3c981d2bbbe889125169bd0bb482e64d3c028a1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 25 Apr 2010 00:49:56 -0700 Subject: netns: rename unregister_pernet_subsys parameter Stay consistent with other functions and with comment also and name pernet_operations parameter properly. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bd8c471..69a20bf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -469,10 +469,10 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); * addition run the exit method for all existing network * namespaces. */ -void unregister_pernet_subsys(struct pernet_operations *module) +void unregister_pernet_subsys(struct pernet_operations *ops) { mutex_lock(&net_mutex); - unregister_pernet_operations(module); + unregister_pernet_operations(ops); mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); -- cgit v1.1 From 3d0c9c4eb2dbdcc461be4084abd87a9a9e70f713 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 26 Apr 2010 16:02:04 +0200 Subject: net: fib_rules: mark arguments to fib_rules_register const and __net_initdata fib_rules_register() duplicates the template passed to it without modification, mark the argument as const. Additionally the templates are only needed when instantiating a new namespace, so mark them as __net_initdata, which means they can be discarded when CONFIG_NET_NS=n. Signed-off-by: Patrick McHardy --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1bc6659..42e84e0 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -122,7 +122,7 @@ errout: } struct fib_rules_ops * -fib_rules_register(struct fib_rules_ops *tmpl, struct net *net) +fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; -- cgit v1.1 From 25239cee7e8732dbdc9f5d324f1c22a3bdec1d1f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 26 Apr 2010 16:02:05 +0200 Subject: net: rtnetlink: decouple rtnetlink address families from real address families Decouple rtnetlink address families from real address families in socket.h to be able to add rtnetlink interfaces to code that is not a real address family without increasing AF_MAX/NPROTO. This will be used to add support for multicast route dumping from all tables as the proc interface can't be extended to support anything but the main table without breaking compatibility. This partialy undoes the patch to introduce independant families for routing rules and converts ipmr routing rules to a new rtnetlink family. Similar to that patch, values up to 127 are reserved for real address families, values above that may be used arbitrarily. Signed-off-by: Patrick McHardy --- net/core/rtnetlink.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 78c85985c..fd781b6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -98,7 +98,7 @@ int lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link *rtnl_msg_handlers[NPROTO]; +static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -118,7 +118,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) { struct rtnl_link *tab; - if (protocol < NPROTO) + if (protocol <= RTNL_FAMILY_MAX) tab = rtnl_msg_handlers[protocol]; else tab = NULL; @@ -133,7 +133,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) { struct rtnl_link *tab; - if (protocol < NPROTO) + if (protocol <= RTNL_FAMILY_MAX) tab = rtnl_msg_handlers[protocol]; else tab = NULL; @@ -167,7 +167,7 @@ int __rtnl_register(int protocol, int msgtype, struct rtnl_link *tab; int msgindex; - BUG_ON(protocol < 0 || protocol >= NPROTO); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); tab = rtnl_msg_handlers[protocol]; @@ -219,7 +219,7 @@ int rtnl_unregister(int protocol, int msgtype) { int msgindex; - BUG_ON(protocol < 0 || protocol >= NPROTO); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); if (rtnl_msg_handlers[protocol] == NULL) @@ -241,7 +241,7 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - BUG_ON(protocol < 0 || protocol >= NPROTO); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); kfree(rtnl_msg_handlers[protocol]); rtnl_msg_handlers[protocol] = NULL; @@ -1384,7 +1384,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (s_idx == 0) s_idx = 1; - for (idx = 1; idx < NPROTO; idx++) { + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { int type = cb->nlh->nlmsg_type-RTM_BASE; if (idx < s_idx || idx == PF_PACKET) continue; -- cgit v1.1 From a9cbd588fdb71ea415754c885e2f9f03e6bf1ba0 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 26 Apr 2010 23:06:24 +0000 Subject: net: reimplement softnet_data.output_queue as a FIFO queue reimplement softnet_data.output_queue as a FIFO queue to keep the fairness among the qdiscs rescheduled. Signed-off-by: Changli Gao Acked-by: Eric Dumazet ---- include/linux/netdevice.h | 1 + net/core/dev.c | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4d43f1a..3d31491 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1557,8 +1557,9 @@ static inline void __netif_reschedule(struct Qdisc *q) local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - q->next_sched = sd->output_queue; - sd->output_queue = q; + q->next_sched = NULL; + *sd->output_queue_tailp = q; + sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -2529,6 +2530,7 @@ static void net_tx_action(struct softirq_action *h) local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); while (head) { @@ -5594,7 +5596,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, void *ocpu) { struct sk_buff **list_skb; - struct Qdisc **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; @@ -5615,13 +5616,13 @@ static int dev_cpu_callback(struct notifier_block *nfb, *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; - /* Find end of our output_queue. */ - list_net = &sd->output_queue; - while (*list_net) - list_net = &(*list_net)->next_sched; /* Append output queue from offline CPU. */ - *list_net = oldsd->output_queue; - oldsd->output_queue = NULL; + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); @@ -5851,7 +5852,8 @@ static int __init net_dev_init(void) skb_queue_head_init(&sd->input_pkt_queue); sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list); - + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS sd->csd.func = rps_trigger_softirq; sd->csd.info = sd; -- cgit v1.1 From 6e7676c1a76aed6e957611d8d7a9e5592e23aeba Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 27 Apr 2010 15:07:33 -0700 Subject: net: batch skb dequeueing from softnet input_pkt_queue batch skb dequeueing from softnet input_pkt_queue to reduce potential lock contention when RPS is enabled. Note: in the worst case, the number of packets in a softnet_data may be double of netdev_max_backlog. Signed-off-by: Changli Gao Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 57 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3d31491..100dcbd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2408,12 +2408,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, __get_cpu_var(netdev_rx_stat).total++; rps_lock(sd); - if (sd->input_pkt_queue.qlen <= netdev_max_backlog) { - if (sd->input_pkt_queue.qlen) { + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); #ifdef CONFIG_RPS - *qtail = sd->input_queue_head + sd->input_pkt_queue.qlen; + *qtail = sd->input_queue_head + + skb_queue_len(&sd->input_pkt_queue); #endif rps_unlock(sd); local_irq_restore(flags); @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg) struct sk_buff *skb, *tmp; rps_lock(sd); - skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->input_pkt_queue); kfree_skb(skb); - input_queue_head_incr(sd); + input_queue_head_add(sd, 1); } + } rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->process_queue); + kfree_skb(skb); + } + } } static int napi_gro_complete(struct sk_buff *skb) @@ -3286,24 +3295,33 @@ static int process_backlog(struct napi_struct *napi, int quota) } #endif napi->weight = weight_p; - do { + local_irq_disable(); + while (work < quota) { struct sk_buff *skb; + unsigned int qlen; + + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_irq_enable(); + __netif_receive_skb(skb); + if (++work >= quota) + return work; + local_irq_disable(); + } - local_irq_disable(); rps_lock(sd); - skb = __skb_dequeue(&sd->input_pkt_queue); - if (!skb) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen) { + input_queue_head_add(sd, qlen); + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + } + if (qlen < quota - work) { __napi_complete(napi); - rps_unlock(sd); - local_irq_enable(); - break; + quota = work + qlen; } - input_queue_head_incr(sd); rps_unlock(sd); - local_irq_enable(); - - __netif_receive_skb(skb); - } while (++work < quota); + } + local_irq_enable(); return work; } @@ -5630,8 +5648,10 @@ static int dev_cpu_callback(struct notifier_block *nfb, /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + input_queue_head_add(oldsd, 1); } + while ((skb = __skb_dequeue(&oldsd->process_queue))) + netif_rx(skb); return NOTIFY_OK; } @@ -5850,6 +5870,7 @@ static int __init net_dev_init(void) struct softnet_data *sd = &per_cpu(softnet_data, i); skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list); sd->output_queue = NULL; -- cgit v1.1 From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Apr 2010 15:13:20 -0700 Subject: net: sk_add_backlog() take rmem_alloc into account Current socket backlog limit is not enough to really stop DDOS attacks, because user thread spend many time to process a full backlog each round, and user might crazy spin on socket lock. We should add backlog size and receive_queue size (aka rmem_alloc) to pace writers, and let user run without being slow down too much. Introduce a sk_rcvqueues_full() helper, to avoid taking socket lock in stress situations. Under huge stress from a multiqueue/RPS enabled NIC, a single flow udp receiver can now process ~200.000 pps (instead of ~100 pps before the patch) on a 8 core machine. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 58ebd14..5104175 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -327,6 +327,10 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; + if (sk_rcvqueues_full(sk, skb)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } if (nested) bh_lock_sock_nested(sk); else @@ -1885,7 +1889,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; - sk->sk_backlog.limit = sk->sk_rcvbuf << 1; sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); -- cgit v1.1 From 05fceb4ad7e8bf809a2a97061d6273d27d1a8449 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 23 Apr 2010 01:40:47 +0000 Subject: net: disallow to use net_assign_generic externally Now there's no need to use this fuction directly because it's handled by register_pernet_device. So to make this simple and easy to understand, make this static to do not tempt potentional users. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/net_namespace.c | 91 ++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 46 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 69a20bf..c988e68 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -27,6 +27,51 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static void net_generic_release(struct rcu_head *rcu) +{ + struct net_generic *ng; + + ng = container_of(rcu, struct net_generic, rcu); + kfree(ng); +} + +static int net_assign_generic(struct net *net, int id, void *data) +{ + struct net_generic *ng, *old_ng; + + BUG_ON(!mutex_is_locked(&net_mutex)); + BUG_ON(id == 0); + + ng = old_ng = net->gen; + if (old_ng->len >= id) + goto assign; + + ng = kzalloc(sizeof(struct net_generic) + + id * sizeof(void *), GFP_KERNEL); + if (ng == NULL) + return -ENOMEM; + + /* + * Some synchronisation notes: + * + * The net_generic explores the net->gen array inside rcu + * read section. Besides once set the net->gen->ptr[x] + * pointer never changes (see rules in netns/generic.h). + * + * That said, we simply duplicate this array and schedule + * the old copy for kfree after a grace period. + */ + + ng->len = id; + memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + + rcu_assign_pointer(net->gen, ng); + call_rcu(&old_ng->rcu, net_generic_release); +assign: + ng->ptr[id - 1] = data; + return 0; +} + static int ops_init(const struct pernet_operations *ops, struct net *net) { int err; @@ -526,49 +571,3 @@ void unregister_pernet_device(struct pernet_operations *ops) mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(unregister_pernet_device); - -static void net_generic_release(struct rcu_head *rcu) -{ - struct net_generic *ng; - - ng = container_of(rcu, struct net_generic, rcu); - kfree(ng); -} - -int net_assign_generic(struct net *net, int id, void *data) -{ - struct net_generic *ng, *old_ng; - - BUG_ON(!mutex_is_locked(&net_mutex)); - BUG_ON(id == 0); - - ng = old_ng = net->gen; - if (old_ng->len >= id) - goto assign; - - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); - if (ng == NULL) - return -ENOMEM; - - /* - * Some synchronisation notes: - * - * The net_generic explores the net->gen array inside rcu - * read section. Besides once set the net->gen->ptr[x] - * pointer never changes (see rules in netns/generic.h). - * - * That said, we simply duplicate this array and schedule - * the old copy for kfree after a grace period. - */ - - ng->len = id; - memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); - - rcu_assign_pointer(net->gen, ng); - call_rcu(&old_ng->rcu, net_generic_release); -assign: - ng->ptr[id - 1] = data; - return 0; -} -EXPORT_SYMBOL_GPL(net_assign_generic); -- cgit v1.1 From 4b0b72f7dd617b13abd1b04c947e15873e011a24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 14:35:48 -0700 Subject: net: speedup udp receive path Since commit 95766fff ([UDP]: Add memory accounting.), each received packet needs one extra sock_lock()/sock_release() pair. This added latency because of possible backlog handling. Then later, ticket spinlocks added yet another latency source in case of DDOS. This patch introduces lock_sock_bh() and unlock_sock_bh() synchronization primitives, avoiding one atomic operation and backlog processing. skb_free_datagram_locked() uses them instead of full blown lock_sock()/release_sock(). skb is orphaned inside locked section for proper socket memory reclaim, and finally freed outside of it. UDP receive path now take the socket spinlock only once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 5574a5d..95b851f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -229,9 +229,13 @@ EXPORT_SYMBOL(skb_free_datagram); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + lock_sock_bh(sk); + skb_orphan(skb); + sk_mem_reclaim_partial(sk); + unlock_sock_bh(sk); + + /* skb is now orphaned, might be freed outside of locked section */ + consume_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); -- cgit v1.1 From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Apr 2010 11:01:49 +0000 Subject: net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 50 +++++++++++++++++++++++++++++++------------------- net/core/stream.c | 10 +++++++--- 2 files changed, 38 insertions(+), 22 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5104175..94c4aff 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1211,7 +1211,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); - newsk->sk_sleep = NULL; + newsk->sk_wq = NULL; if (newsk->sk_prot->sockets_allocated) percpu_counter_inc(newsk->sk_prot->sockets_allocated); @@ -1800,41 +1800,53 @@ EXPORT_SYMBOL(sock_no_sendpage); static void sock_def_wakeup(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up_interruptible_all(sk_sleep(sk)); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up_interruptible_poll(sk_sleep(sk), POLLERR); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_readable(struct sock *sk, int len) { - read_lock(&sk->sk_callback_lock); - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN | + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT | + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); /* Should agree with poll, otherwise some programs break */ @@ -1842,7 +1854,7 @@ static void sock_def_write_space(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_destruct(struct sock *sk) @@ -1896,10 +1908,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) if (sock) { sk->sk_type = sock->type; - sk->sk_sleep = &sock->wait; + sk->sk_wq = sock->wq; sock->sk = sk; } else - sk->sk_sleep = NULL; + sk->sk_wq = NULL; spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); diff --git a/net/core/stream.c b/net/core/stream.c index 7b3c3f3..cc196f4 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -28,15 +28,19 @@ void sk_stream_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; + struct socket_wq *wq; if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); - if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); - if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); } } -- cgit v1.1 From 47d29646a2c1c147d8a7598aeac2c87dd71ed638 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 2 May 2010 02:21:44 -0700 Subject: net: Inline skb_pull() in eth_type_trans(). In commit 6be8ac2f ("[NET]: uninline skb_pull, de-bloats a lot") we uninlined skb_pull. But in some critical paths it makes sense to inline this thing and it helps performance significantly. Create an skb_pull_inline() so that we can do this in a way that serves also as annotation. Based upon a patch by Eric Dumazet. Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4218ff4..8b9c109 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1051,7 +1051,7 @@ EXPORT_SYMBOL(skb_push); */ unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) { - return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); + return skb_pull_inline(skb, len); } EXPORT_SYMBOL(skb_pull); -- cgit v1.1 From dee42870a423ad485129f43cddfe7275479f11d8 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sun, 2 May 2010 05:42:16 +0000 Subject: net: fix softnet_stat Per cpu variable softnet_data.total was shared between IRQ and SoftIRQ context without any protection. And enqueue_to_backlog should update the netdev_rx_stat of the target CPU. This patch renames softnet_data.total to softnet_data.processed: the number of packets processed in uppper levels(IP stacks). softnet_stat data is moved into softnet_data. Signed-off-by: Changli Gao ---- include/linux/netdevice.h | 17 +++++++---------- net/core/dev.c | 26 ++++++++++++-------------- net/sched/sch_generic.c | 2 +- 3 files changed, 20 insertions(+), 25 deletions(-) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 100dcbd..36d53be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2205,8 +2205,6 @@ int netdev_max_backlog __read_mostly = 1000; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ -DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; - #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ @@ -2366,7 +2364,7 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; __napi_schedule(&sd->backlog); - __get_cpu_var(netdev_rx_stat).received_rps++; + sd->received_rps++; } #endif /* CONFIG_RPS */ @@ -2405,7 +2403,6 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); - __get_cpu_var(netdev_rx_stat).total++; rps_lock(sd); if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { @@ -2429,9 +2426,9 @@ enqueue: goto enqueue; } + sd->dropped++; rps_unlock(sd); - __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); kfree_skb(skb); @@ -2806,7 +2803,7 @@ static int __netif_receive_skb(struct sk_buff *skb) skb->dev = master; } - __get_cpu_var(netdev_rx_stat).total++; + __get_cpu_var(softnet_data).processed++; skb_reset_network_header(skb); skb_reset_transport_header(skb); @@ -3490,7 +3487,7 @@ out: return; softnet_break: - __get_cpu_var(netdev_rx_stat).time_squeeze++; + sd->time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } @@ -3691,17 +3688,17 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } -static struct netif_rx_stats *softnet_get_online(loff_t *pos) +static struct softnet_data *softnet_get_online(loff_t *pos) { - struct netif_rx_stats *rc = NULL; + struct softnet_data *sd = NULL; while (*pos < nr_cpu_ids) if (cpu_online(*pos)) { - rc = &per_cpu(netdev_rx_stat, *pos); + sd = &per_cpu(softnet_data, *pos); break; } else ++*pos; - return rc; + return sd; } static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) @@ -3721,12 +3718,12 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { - struct netif_rx_stats *s = v; + struct softnet_data *sd = v; seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, 0, + sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - s->cpu_collision, s->received_rps); + sd->cpu_collision, sd->received_rps); return 0; } @@ -5869,6 +5866,7 @@ static int __init net_dev_init(void) for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); + memset(sd, 0, sizeof(*sd)); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); sd->completion_queue = NULL; -- cgit v1.1 From 93bb64eac10aad3dae6178d7da94765f207d121f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 3 May 2010 23:18:14 -0700 Subject: net: skb_free_datagram_locked() fix Commit 4b0b72f7dd617b ( net: speedup udp receive path ) introduced a bug in skb_free_datagram_locked(). We should not skb_orphan() skb if we dont have the guarantee we are the last skb user, this might happen with MSG_PEEK concurrent users. To keep socket locked for the smallest period of time, we split consume_skb() logic, inlined in skb_free_datagram_locked() Reported-by: Stephen Hemminger Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 95b851f..e009753 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -229,13 +229,18 @@ EXPORT_SYMBOL(skb_free_datagram); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + lock_sock_bh(sk); skb_orphan(skb); sk_mem_reclaim_partial(sk); unlock_sock_bh(sk); - /* skb is now orphaned, might be freed outside of locked section */ - consume_skb(skb); + /* skb is now orphaned, can be freed outside of locked section */ + __kfree_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); -- cgit v1.1 From ec7d2f2cf3a1b76202986519ec4f8ec75b2de232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 May 2010 01:07:37 -0700 Subject: net: __alloc_skb() speedup With following patch I can reach maximum rate of my pktgen+udpsink simulator : - 'old' machine : dual quad core E5450 @3.00GHz - 64 UDP rx flows (only differ by destination port) - RPS enabled, NIC interrupts serviced on cpu0 - rps dispatched on 7 other cores. (~130.000 IPI per second) - SLAB allocator (faster than SLUB in this workload) - tg3 NIC - 1.080.000 pps without a single drop at NIC level. Idea is to add two prefetchw() calls in __alloc_skb(), one to prefetch first sk_buff cache line, the second to prefetch the shinfo part. Also using one memset() to initialize all skb_shared_info fields instead of one by one to reduce number of instructions, using long word moves. All skb_shared_info fields before 'dataref' are cleared in __alloc_skb(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8b9c109..a9b0e1f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -181,12 +181,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); if (!skb) goto out; + prefetchw(skb); size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); if (!data) goto nodata; + prefetchw(data + size); /* * Only clear those fields we need to clear, not those that we will @@ -208,15 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - shinfo->nr_frags = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->tx_flags.flags = 0; - skb_frag_list_init(skb); - memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -505,16 +500,10 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size) return 0; skb_release_head_state(skb); + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - shinfo->nr_frags = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->tx_flags.flags = 0; - skb_frag_list_init(skb); - memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); memset(skb, 0, offsetof(struct sk_buff, tail)); skb->data = skb->head + NET_SKB_PAD; -- cgit v1.1 From 0e34e93177fb1f642cab080e0bde664c06c7183a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 6 May 2010 00:47:21 -0700 Subject: netpoll: add generic support for bridge and bonding devices This whole patchset is for adding netpoll support to bridge and bonding devices. I already tested it for bridge, bonding, bridge over bonding, and bonding over bridge. It looks fine now. To make bridge and bonding support netpoll, we need to adjust some netpoll generic code. This patch does the following things: 1) introduce two new priv_flags for struct net_device: IFF_IN_NETPOLL which identifies we are processing a netpoll; IFF_DISABLE_NETPOLL is used to disable netpoll support for a device at run-time; 2) introduce one new method for netdev_ops: ->ndo_netpoll_cleanup() is used to clean up netpoll when a device is removed. 3) introduce netpoll_poll_dev() which takes a struct net_device * parameter; export netpoll_send_skb() and netpoll_poll_dev() which will be used later; 4) hide a pointer to struct netpoll in struct netpoll_info, ditto. 5) introduce ->real_dev for struct netpoll. 6) introduce a new status NETDEV_BONDING_DESLAE, which is used to disable netconsole before releasing a slave, to avoid deadlocks. Cc: David Miller Cc: Neil Horman Signed-off-by: WANG Cong Signed-off-by: David S. Miller --- net/core/netpoll.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a58f59b..94825b1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -179,9 +179,8 @@ static void service_arp_queue(struct netpoll_info *npi) } } -void netpoll_poll(struct netpoll *np) +void netpoll_poll_dev(struct net_device *dev) { - struct net_device *dev = np->dev; const struct net_device_ops *ops; if (!dev || !netif_running(dev)) @@ -201,6 +200,11 @@ void netpoll_poll(struct netpoll *np) zap_completion_queue(); } +void netpoll_poll(struct netpoll *np) +{ + netpoll_poll_dev(np->dev); +} + static void refill_skbs(void) { struct sk_buff *skb; @@ -282,7 +286,7 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } -static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { int status = NETDEV_TX_BUSY; unsigned long tries; @@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) tries > 0; --tries) { if (__netif_tx_trylock(txq)) { if (!netif_tx_queue_stopped(txq)) { + dev->priv_flags |= IFF_IN_NETPOLL; status = ops->ndo_start_xmit(skb, dev); + dev->priv_flags &= ~IFF_IN_NETPOLL; if (status == NETDEV_TX_OK) txq_trans_update(txq); } @@ -756,7 +762,10 @@ int netpoll_setup(struct netpoll *np) atomic_inc(&npinfo->refcnt); } - if (!ndev->netdev_ops->ndo_poll_controller) { + npinfo->netpoll = np; + + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || + !ndev->netdev_ops->ndo_poll_controller) { printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", np->name, np->dev_name); err = -ENOTSUPP; @@ -878,6 +887,7 @@ void netpoll_cleanup(struct netpoll *np) } if (atomic_dec_and_test(&npinfo->refcnt)) { + const struct net_device_ops *ops; skb_queue_purge(&npinfo->arp_tx); skb_queue_purge(&npinfo->txq); cancel_rearming_delayed_work(&npinfo->tx_work); @@ -885,7 +895,11 @@ void netpoll_cleanup(struct netpoll *np) /* clean after last, unfinished work */ __skb_queue_purge(&npinfo->txq); kfree(npinfo); - np->dev->npinfo = NULL; + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_cleanup) + ops->ndo_netpoll_cleanup(np->dev); + else + np->dev->npinfo = NULL; } } @@ -908,6 +922,7 @@ void netpoll_set_trap(int trap) atomic_dec(&trapped); } +EXPORT_SYMBOL(netpoll_send_skb); EXPORT_SYMBOL(netpoll_set_trap); EXPORT_SYMBOL(netpoll_trap); EXPORT_SYMBOL(netpoll_print_options); @@ -915,4 +930,5 @@ EXPORT_SYMBOL(netpoll_parse_options); EXPORT_SYMBOL(netpoll_setup); EXPORT_SYMBOL(netpoll_cleanup); EXPORT_SYMBOL(netpoll_send_udp); +EXPORT_SYMBOL(netpoll_poll_dev); EXPORT_SYMBOL(netpoll_poll); -- cgit v1.1 From 6ec82562ffc6f297d0de36d65776cff8e5704867 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 May 2010 00:53:53 -0700 Subject: veth: Dont kfree_skb() after dev_forward_skb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of congestion, netif_rx() frees the skb, so we must assume dev_forward_skb() also consume skb. Bug introduced by commit 445409602c092 (veth: move loopback logic to common location) We must change dev_forward_skb() to always consume skb, and veth to not double free it. Bug report : http://marc.info/?l=linux-netdev&m=127310770900442&w=3 Reported-by: Martín Ferrari Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f769098..264137f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1451,7 +1451,7 @@ static inline void net_timestamp(struct sk_buff *skb) * * return values: * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) + * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue @@ -1465,12 +1465,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { skb_orphan(skb); - if (!(dev->flags & IFF_UP)) - return NET_RX_DROP; - - if (skb->len > (dev->mtu + dev->hard_header_len)) + if (!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len))) { + kfree_skb(skb); return NET_RX_DROP; - + } skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; -- cgit v1.1 From eecfd7c4e36ff532d895885971d01d049bd3e014 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 May 2010 22:07:48 -0700 Subject: rps: Various optimizations Introduce ____napi_schedule() helper for callers in irq disabled contexts. rps_trigger_softirq() becomes a leaf function. Use container_of() in process_backlog() instead of accessing per_cpu address. Use a custom inlined version of __napi_complete() in process_backlog() to avoid one locked instruction : only current cpu owns and manipulates this napi, and NAPI_STATE_SCHED is the only possible flag set on backlog. we can use a plain write instead of clear_bit(), and we dont need an smp_mb() memory barrier, since RPS is on, backlog is protected by a spinlock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 36d53be..32611c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2205,6 +2205,14 @@ int netdev_max_backlog __read_mostly = 1000; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ @@ -2363,7 +2371,7 @@ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; - __napi_schedule(&sd->backlog); + ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } @@ -2421,7 +2429,7 @@ enqueue: /* Schedule NAPI for backlog device */ if (napi_schedule_prep(&sd->backlog)) { if (!rps_ipi_queued(sd)) - __napi_schedule(&sd->backlog); + ____napi_schedule(sd, &sd->backlog); } goto enqueue; } @@ -3280,7 +3288,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); #ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, @@ -3313,7 +3321,16 @@ static int process_backlog(struct napi_struct *napi, int quota) &sd->process_queue); } if (qlen < quota - work) { - __napi_complete(napi); + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set on backlog. + * we can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + list_del(&napi->poll_list); + napi->state = 0; + quota = work + qlen; } rps_unlock(sd); @@ -3334,8 +3351,7 @@ void __napi_schedule(struct napi_struct *n) unsigned long flags; local_irq_save(flags); - list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + ____napi_schedule(&__get_cpu_var(softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); -- cgit v1.1 From a14462f1bd4d3962994f518459102000438665aa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 6 May 2010 01:33:53 +0000 Subject: net: adjust handle_macvlan to pass port struct to hook Now there's null check here and also again in the hook. Looking at bridge bits which are simmilar, port structure is rcu_dereferenced right away in handle_bridge and passed to hook. Looks nicer. Signed-off-by: Jiri Pirko Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3daee30..5cbba09 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2612,7 +2612,8 @@ static inline struct sk_buff *handle_bridge(struct sk_buff *skb, #endif #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) -struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; +struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p, + struct sk_buff *skb) __read_mostly; EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, @@ -2620,14 +2621,17 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, int *ret, struct net_device *orig_dev) { - if (skb->dev->macvlan_port == NULL) + struct macvlan_port *port; + + port = rcu_dereference(skb->dev->macvlan_port); + if (!port) return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - return macvlan_handle_frame_hook(skb); + return macvlan_handle_frame_hook(port, skb); } #else #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) -- cgit v1.1 From 3b098e2d7c693796cc4dffb07caa249fc0f70771 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 May 2010 23:57:10 -0700 Subject: net: Consistent skb timestamping With RPS inclusion, skb timestamping is not consistent in RX path. If netif_receive_skb() is used, its deferred after RPS dispatch. If netif_rx() is used, its done before RPS dispatch. This can give strange tcpdump timestamps results. I think timestamping should be done as soon as possible in the receive path, to get meaningful values (ie timestamps taken at the time packet was delivered by NIC driver to our stack), even if NAPI already can defer timestamping a bit (RPS can help to reduce the gap) Tom Herbert prefer to sample timestamps after RPS dispatch. In case sampling is expensive (HPET/acpi_pm on x86), this makes sense. Let admins switch from one mode to another, using a new sysctl, /proc/sys/net/core/netdev_tstamp_prequeue Its default value (1), means timestamps are taken as soon as possible, before backlog queueing, giving accurate timestamps. Setting a 0 value permits to sample timestamps when processing backlog, after RPS dispatch, to lower the load of the pre-RPS cpu. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 50 ++++++++++++++++++++++++++++------------------ net/core/sysctl_net_core.c | 7 +++++++ 2 files changed, 38 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5cbba09..988e429 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1454,7 +1454,7 @@ void net_disable_timestamp(void) } EXPORT_SYMBOL(net_disable_timestamp); -static inline void net_timestamp(struct sk_buff *skb) +static inline void net_timestamp_set(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) __net_timestamp(skb); @@ -1462,6 +1462,12 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +static inline void net_timestamp_check(struct sk_buff *skb) +{ + if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) + __net_timestamp(skb); +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1508,9 +1514,9 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) #ifdef CONFIG_NET_CLS_ACT if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) - net_timestamp(skb); + net_timestamp_set(skb); #else - net_timestamp(skb); + net_timestamp_set(skb); #endif rcu_read_lock(); @@ -2201,6 +2207,7 @@ EXPORT_SYMBOL(dev_queue_xmit); =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ @@ -2465,8 +2472,8 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); #ifdef CONFIG_RPS { @@ -2791,8 +2798,8 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (!netdev_tstamp_prequeue) + net_timestamp_check(skb); if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; @@ -2910,23 +2917,28 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); + #ifdef CONFIG_RPS - struct rps_dev_flow voidflow, *rflow = &voidflow; - int cpu, ret; + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; - rcu_read_lock(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); - cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + ret = __netif_receive_skb(skb); + } - if (cpu >= 0) { - ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); - rcu_read_unlock(); - } else { - rcu_read_unlock(); - ret = __netif_receive_skb(skb); + return ret; } - - return ret; #else return __netif_receive_skb(skb); #endif diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index dcc7d25..01eee5d 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -122,6 +122,13 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { + .procname = "netdev_tstamp_prequeue", + .data = &netdev_tstamp_prequeue, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), -- cgit v1.1 From a465419b1febb603821f924805529cff89cafeed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 May 2010 00:36:33 -0700 Subject: net: Introduce sk_route_nocaps TCP-MD5 sessions have intermittent failures, when route cache is invalidated. ip_queue_xmit() has to find a new route, calls sk_setup_caps(sk, &rt->u.dst), destroying the sk->sk_route_caps &= ~NETIF_F_GSO_MASK that MD5 desperately try to make all over its way (from tcp_transmit_skb() for example) So we send few bad packets, and everything is fine when tcp_transmit_skb() is called again for this socket. Since ip_queue_xmit() is at a lower level than TCP-MD5, I chose to use a socket field, sk_route_nocaps, containing bits to mask on sk_route_caps. Reported-by: Bhaskar Dutta Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 94c4aff..63530a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1231,6 +1231,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + sk->sk_route_caps &= ~sk->sk_route_nocaps; if (sk_can_gso(sk)) { if (dst->header_len) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; -- cgit v1.1 From c02db8c6290bb992442fec1407643c94cc414375 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sun, 16 May 2010 01:05:45 -0700 Subject: rtnetlink: make SR-IOV VF interface symmetric Now we have a set of nested attributes: IFLA_VFINFO_LIST (NESTED) IFLA_VF_INFO (NESTED) IFLA_VF_MAC IFLA_VF_VLAN IFLA_VF_TX_RATE This allows a single set to operate on multiple attributes if desired. Among other things, it means a dump can be replayed to set state. The current interface has yet to be released, so this seems like something to consider for 2.6.34. Signed-off-by: Chris Wright Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 159 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 110 insertions(+), 49 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fe776c9..31e85d3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -602,12 +602,19 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->tx_compressed = b->tx_compressed; }; +/* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent)) - return dev_num_vf(dev->dev.parent) * - sizeof(struct ifla_vf_info); - else + if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { + + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nlmsg_total_size(sizeof(struct nlattr)); + size += nlmsg_total_size(num_vfs * sizeof(struct nlattr)); + size += num_vfs * (sizeof(struct ifla_vf_mac) + + sizeof(struct ifla_vf_vlan) + + sizeof(struct ifla_vf_tx_rate)); + return size; + } else return 0; } @@ -629,7 +636,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_NUM_VF */ - + nla_total_size(rtnl_vfinfo_size(dev)) /* IFLA_VFINFO */ + + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ } @@ -700,14 +707,37 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { int i; - struct ifla_vf_info ivi; - NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); - for (i = 0; i < dev_num_vf(dev->dev.parent); i++) { + struct nlattr *vfinfo, *vf; + int num_vfs = dev_num_vf(dev->dev.parent); + + NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs); + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + goto nla_put_failure; + for (i = 0; i < num_vfs; i++) { + struct ifla_vf_info ivi; + struct ifla_vf_mac vf_mac; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_tx_rate vf_tx_rate; if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; - NLA_PUT(skb, IFLA_VFINFO, sizeof(ivi), &ivi); + vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.tx_rate; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); + NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); + NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); + nla_nest_end(skb, vf); } + nla_nest_end(skb, vfinfo); } if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) @@ -769,12 +799,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, - [IFLA_VF_MAC] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_mac) }, - [IFLA_VF_VLAN] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_vlan) }, - [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_tx_rate) }, + [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, }; EXPORT_SYMBOL(ifla_policy); @@ -783,6 +808,19 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; +static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { + [IFLA_VF_INFO] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { + [IFLA_VF_MAC] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_VLAN] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_tx_rate) }, +}; + struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; @@ -812,6 +850,52 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return 0; } +static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +{ + int rem, err = -EINVAL; + struct nlattr *vf; + const struct net_device_ops *ops = dev->netdev_ops; + + nla_for_each_nested(vf, attr, rem) { + switch (nla_type(vf)) { + case IFLA_VF_MAC: { + struct ifla_vf_mac *ivm; + ivm = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + break; + } + case IFLA_VF_VLAN: { + struct ifla_vf_vlan *ivv; + ivv = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, + ivv->vlan, + ivv->qos); + break; + } + case IFLA_VF_TX_RATE: { + struct ifla_vf_tx_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_tx_rate) + err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, + ivt->rate); + break; + } + default: + err = -EINVAL; + break; + } + if (err) + break; + } + return err; +} + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { @@ -942,40 +1026,17 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, write_unlock_bh(&dev_base_lock); } - if (tb[IFLA_VF_MAC]) { - struct ifla_vf_mac *ivm; - ivm = nla_data(tb[IFLA_VF_MAC]); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_mac) - err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); - if (err < 0) - goto errout; - modified = 1; - } - - if (tb[IFLA_VF_VLAN]) { - struct ifla_vf_vlan *ivv; - ivv = nla_data(tb[IFLA_VF_VLAN]); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_vlan) - err = ops->ndo_set_vf_vlan(dev, ivv->vf, - ivv->vlan, - ivv->qos); - if (err < 0) - goto errout; - modified = 1; - } - err = 0; - - if (tb[IFLA_VF_TX_RATE]) { - struct ifla_vf_tx_rate *ivt; - ivt = nla_data(tb[IFLA_VF_TX_RATE]); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_tx_rate) - err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ivt->rate); - if (err < 0) - goto errout; - modified = 1; + if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *attr; + int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { + if (nla_type(attr) != IFLA_VF_INFO) + goto errout; + err = do_setvfinfo(dev, attr); + if (err < 0) + goto errout; + modified = 1; + } } err = 0; -- cgit v1.1 From ebda37c27d0c768947e9b058332d7ea798210cf8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 May 2010 23:51:21 +0000 Subject: rps: avoid one atomic in enqueue_to_backlog If CONFIG_SMP=y, then we own a queue spinlock, we can avoid the atomic test_and_set_bit() from napi_schedule_prep(). We now have same number of atomic ops per netif_rx() calls than with pre-RPS kernel. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 988e429..cdcb9cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2432,8 +2432,10 @@ enqueue: return NET_RX_SUCCESS; } - /* Schedule NAPI for backlog device */ - if (napi_schedule_prep(&sd->backlog)) { + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { if (!rps_ipi_queued(sd)) ____napi_schedule(sd, &sd->backlog); } -- cgit v1.1 From 7fee226ad2397b635e2fd565a59ca3ae08a164cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 May 2010 23:19:48 +0000 Subject: net: add a noref bit on skb dst Use low order bit of skb->_skb_dst to tell dst is not refcounted. Change _skb_dst to _skb_refdst to make sure all uses are catched. skb_dst() returns the dst, regardless of noref bit set or not, but with a lockdep check to make sure a noref dst is not given if current user is not rcu protected. New skb_dst_set_noref() helper to set an notrefcounted dst on a skb. (with lockdep check) skb_dst_drop() drops a reference only if skb dst was refcounted. skb_dst_force() helper is used to force a refcount on dst, when skb is queued and not anymore RCU protected. Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if !IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in sock_queue_rcv_skb(), in __nf_queue(). Use skb_dst_force() in dev_requeue_skb(). Note: dst_use_noref() still dirties dst, we might transform it later to do one dirtying per jiffies. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ net/core/skbuff.c | 2 +- net/core/sock.c | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cdcb9cb..6c82065 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2052,6 +2052,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); __qdisc_update_bstats(q, skb->len); if (sch_direct_xmit(skb, q, dev, txq, root_lock)) __qdisc_run(q); @@ -2060,6 +2062,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { + skb_dst_force(skb); rc = qdisc_enqueue_root(skb, q); qdisc_run(q); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a9b0e1f..c543dd2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -520,7 +520,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; - skb_dst_set(new, dst_clone(skb_dst(old))); + skb_dst_copy(new, old); new->rxhash = old->rxhash; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); diff --git a/net/core/sock.c b/net/core/sock.c index 63530a0..bf88a16 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ skb_len = skb->len; + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + spin_lock_irqsave(&list->lock, flags); skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(list, skb); @@ -1536,6 +1541,7 @@ static void __release_sock(struct sock *sk) do { struct sk_buff *next = skb->next; + WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); -- cgit v1.1 From ccbd6a5a4f76e821ed36f69fdaf59817c3a7f18e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 14 May 2010 10:58:26 +0000 Subject: net: Remove unnecessary semicolons after switch statements Also added an explicit break; to avoid a fallthrough in net/ipv4/tcp_input.c Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/core/ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1a7db92..a0f4964 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -522,7 +522,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) p += ETH_GSTRING_LEN; num_strings++; goto unknown_filter; - }; + } /* now the rest of the filters */ switch (fsc->fs.flow_type) { @@ -646,7 +646,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) p += ETH_GSTRING_LEN; num_strings++; break; - }; + } sprintf(p, "\tVLAN: %d, mask: 0x%x\n", fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); p += ETH_GSTRING_LEN; -- cgit v1.1 From 57b610805ce92dbd79fc97509f80fa5391b99623 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Mon, 17 May 2010 22:49:55 -0700 Subject: net: Add netlink support for virtual port management (was iovnl) Add new netdev ops ndo_{set|get}_vf_port to allow setting of port-profile on a netdev interface. Extends netlink socket RTM_SETLINK/ RTM_GETLINK with two new sub msgs called IFLA_VF_PORTS and IFLA_PORT_SELF (added to end of IFLA_cmd list). These are both nested atrtibutes using this layout: [IFLA_NUM_VF] [IFLA_VF_PORTS] [IFLA_VF_PORT] [IFLA_PORT_*], ... [IFLA_VF_PORT] [IFLA_PORT_*], ... ... [IFLA_PORT_SELF] [IFLA_PORT_*], ... These attributes are design to be set and get symmetrically. VF_PORTS is a list of VF_PORTs, one for each VF, when dealing with an SR-IOV device. PORT_SELF is for the PF of the SR-IOV device, in case it wants to also have a port-profile, or for the case where the VF==PF, like in enic patch 2/2 of this patch set. A port-profile is used to configure/enable the external switch virtual port backing the netdev interface, not to configure the host-facing side of the netdev. A port-profile is an identifier known to the switch. How port- profiles are installed on the switch or how available port-profiles are made know to the host is outside the scope of this patch. There are two types of port-profiles specs in the netlink msg. The first spec is for 802.1Qbg (pre-)standard, VDP protocol. The second spec is for devices that run a similar protocol as VDP but in firmware, thus hiding the protocol details. In either case, the specs have much in common and makes sense to define the netlink msg as the union of the two specs. For example, both specs have a notition of associating/deassociating a port-profile. And both specs require some information from the hypervisor manager, such as client port instance ID. The general flow is the port-profile is applied to a host netdev interface using RTM_SETLINK, the receiver of the RTM_SETLINK msg communicates with the switch, and the switch virtual port backing the host netdev interface is configured/enabled based on the settings defined by the port-profile. What those settings comprise, and how those settings are managed is again outside the scope of this patch, since this patch only deals with the first step in the flow. Signed-off-by: Scott Feldman Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 66db120..e4b9870 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -660,6 +660,31 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev) return 0; } +static size_t rtnl_port_size(const struct net_device *dev) +{ + size_t port_size = nla_total_size(4) /* PORT_VF */ + + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + + nla_total_size(sizeof(struct ifla_port_vsi)) + /* PORT_VSI_TYPE */ + + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + + nla_total_size(1) /* PROT_VDP_REQUEST */ + + nla_total_size(2); /* PORT_VDP_RESPONSE */ + size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); + size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + if (dev_num_vf(dev->dev.parent)) + return port_self_size + vf_ports_size + + vf_port_size * dev_num_vf(dev->dev.parent); + else + return port_self_size; +} + static inline size_t if_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) @@ -680,9 +705,82 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ } +static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *vf_ports; + struct nlattr *vf_port; + int vf; + int err; + + vf_ports = nla_nest_start(skb, IFLA_VF_PORTS); + if (!vf_ports) + return -EMSGSIZE; + + for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { + vf_port = nla_nest_start(skb, IFLA_VF_PORT); + if (!vf_port) { + nla_nest_cancel(skb, vf_ports); + return -EMSGSIZE; + } + NLA_PUT_U32(skb, IFLA_PORT_VF, vf); + err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); + if (err) { +nla_put_failure: + nla_nest_cancel(skb, vf_port); + continue; + } + nla_nest_end(skb, vf_port); + } + + nla_nest_end(skb, vf_ports); + + return 0; +} + +static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *port_self; + int err; + + port_self = nla_nest_start(skb, IFLA_PORT_SELF); + if (!port_self) + return -EMSGSIZE; + + err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); + if (err) { + nla_nest_cancel(skb, port_self); + return err; + } + + nla_nest_end(skb, port_self); + + return 0; +} + +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + + err = rtnl_port_self_fill(skb, dev); + if (err) + return err; + + if (dev_num_vf(dev->dev.parent)) { + err = rtnl_vf_ports_fill(skb, dev); + if (err) + return err; + } + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags) @@ -754,13 +852,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; copy_rtnl_link_stats64(nla_data(attr), stats); + if (dev->dev.parent) + NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { int i; struct nlattr *vfinfo, *vf; int num_vfs = dev_num_vf(dev->dev.parent); - NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); if (!vfinfo) goto nla_put_failure; @@ -788,6 +888,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, } nla_nest_end(skb, vfinfo); } + + if (rtnl_port_fill(skb, dev)) + goto nla_put_failure; + if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; @@ -849,6 +953,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, + [IFLA_VF_PORTS] = { .type = NLA_NESTED }, + [IFLA_PORT_SELF] = { .type = NLA_NESTED }, }; EXPORT_SYMBOL(ifla_policy); @@ -870,6 +976,20 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { .len = sizeof(struct ifla_vf_tx_rate) }, }; +static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { + [IFLA_PORT_VF] = { .type = NLA_U32 }, + [IFLA_PORT_PROFILE] = { .type = NLA_STRING, + .len = PORT_PROFILE_MAX }, + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi)}, + [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, + .len = PORT_UUID_MAX }, + [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, + .len = PORT_UUID_MAX }, + [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, + [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, +}; + struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; @@ -1089,6 +1209,53 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, } err = 0; + if (tb[IFLA_VF_PORTS]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + struct nlattr *attr; + int vf; + int rem; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_port) + goto errout; + + nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { + if (nla_type(attr) != IFLA_VF_PORT) + continue; + err = nla_parse_nested(port, IFLA_PORT_MAX, + attr, ifla_port_policy); + if (err < 0) + goto errout; + if (!port[IFLA_PORT_VF]) { + err = -EOPNOTSUPP; + goto errout; + } + vf = nla_get_u32(port[IFLA_PORT_VF]); + err = ops->ndo_set_vf_port(dev, vf, port); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_PORT_SELF]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + + err = nla_parse_nested(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], ifla_port_policy); + if (err < 0) + goto errout; + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_port) + err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); + if (err < 0) + goto errout; + modified = 1; + } + err = 0; + errout: if (err < 0 && modified && net_ratelimit()) printk(KERN_WARNING "A link change request failed with " -- cgit v1.1 From 608b4b9548dedf4185ca47edcaae4bff2ceb62de Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 4 May 2010 17:36:45 -0700 Subject: netns: Teach network device kobjects which namespace they are in. The problem. Network devices show up in sysfs and with the network namespace active multiple devices with the same name can show up in the same directory, ouch! To avoid that problem and allow existing applications in network namespaces to see the same interface that is currently presented in sysfs, this patch enables the tagging directory support in sysfs. By using the network namespace pointers as tags to separate out the the sysfs directory entries we ensure that we don't have conflicts in the directories and applications only see a limited set of the network devices. Signed-off-by: Eric W. Biederman Acked-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/net-sysfs.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c57c4b2..b388cda 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -14,7 +14,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -766,6 +768,38 @@ static void rx_queue_remove_kobjects(struct net_device *net) kset_unregister(net->queues_kset); } #endif /* CONFIG_RPS */ + +static const void *net_current_ns(void) +{ + return current->nsproxy->net_ns; +} + +static const void *net_initial_ns(void) +{ + return &init_net; +} + +static const void *net_netlink_ns(struct sock *sk) +{ + return sock_net(sk); +} + +static struct kobj_ns_type_operations net_ns_type_operations = { + .type = KOBJ_NS_TYPE_NET, + .current_ns = net_current_ns, + .netlink_ns = net_netlink_ns, + .initial_ns = net_initial_ns, +}; + +static void net_kobj_ns_exit(struct net *net) +{ + kobj_ns_exit(KOBJ_NS_TYPE_NET, net); +} + +static struct pernet_operations sysfs_net_ops = { + .exit = net_kobj_ns_exit, +}; + #endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG @@ -806,6 +840,13 @@ static void netdev_release(struct device *d) kfree((char *)dev - dev->padded); } +static const void *net_namespace(struct device *d) +{ + struct net_device *dev; + dev = container_of(d, struct net_device, dev); + return dev_net(dev); +} + static struct class net_class = { .name = "net", .dev_release = netdev_release, @@ -815,6 +856,8 @@ static struct class net_class = { #ifdef CONFIG_HOTPLUG .dev_uevent = netdev_uevent, #endif + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, }; /* Delete sysfs entries but hold kobject reference until after all @@ -904,5 +947,9 @@ void netdev_initialize_kobject(struct net_device *net) int netdev_kobject_init(void) { + kobj_ns_type_register(&net_ns_type_operations); +#ifdef CONFIG_SYSFS + register_pernet_subsys(&sysfs_net_ops); +#endif return class_register(&net_class); } -- cgit v1.1 From d6523ddf2376f39eaa89a4d68a33052d20c138b9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 16 May 2010 21:59:45 -0700 Subject: net/sysfs: Fix the bitrot in network device kobject namespace support I had a couple of stupid bugs in: netns: Teach network device kobjects which namespace they are in. - I duplicated the Kconfig for the NET_NS - The build was broken when sysfs was not compiled in The sysfs breakage is because after I moved the operations for the sysfs to the kobject layer, to make things cleaner I forgot to move the ifdefs. Opps. I'm not quite certain how I got introduced a second NET_NS Kconfig, but it was probably a 3 way merge somewhere along the way that did not notice that the NET_NS Kconfig option had mvoed and thout that was a bug. It probably slipped in because it used to be the sysfs patches were the first patches in my network namespace patches. Some things just don't go like you would expect. Neither of these bugs actually affect anything in the common case but they should be fixed. Thanks to Serge for noticing they were present. Reported-by: Serge E. Hallyn Signed-off-by: Eric W. Biederman Acked-by: David S. Miller --- net/core/net-sysfs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b388cda..6881e65 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -469,6 +469,7 @@ static struct attribute_group wireless_group = { .attrs = wireless_attrs, }; #endif +#endif /* CONFIG_SYSFS */ #ifdef CONFIG_RPS /* @@ -796,11 +797,10 @@ static void net_kobj_ns_exit(struct net *net) kobj_ns_exit(KOBJ_NS_TYPE_NET, net); } -static struct pernet_operations sysfs_net_ops = { +static struct pernet_operations kobj_net_ops = { .exit = net_kobj_ns_exit, }; -#endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) @@ -948,8 +948,6 @@ void netdev_initialize_kobject(struct net_device *net) int netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); -#ifdef CONFIG_SYSFS - register_pernet_subsys(&sysfs_net_ops); -#endif + register_pernet_subsys(&kobj_net_ops); return class_register(&net_class); } -- cgit v1.1 From a1b3f594dc5faab91d3a218c7019e9b5edd9fe1a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 4 May 2010 17:36:49 -0700 Subject: net: Expose all network devices in a namespaces in sysfs This reverts commit aaf8cdc34ddba08122f02217d9d684e2f9f5d575. Drivers like the ipw2100 call device_create_group when they are initialized and device_remove_group when they are shutdown. Moving them between namespaces deletes their sysfs groups early. In particular the following call chain results. netdev_unregister_kobject -> device_del -> kobject_del -> sysfs_remove_dir With sysfs_remove_dir recursively deleting all of it's subdirectories, and nothing adding them back. Ouch! Therefore we need to call something that ultimate calls sysfs_mv_dir as that sysfs function can move sysfs directories between namespaces without deleting their subdirectories or their contents. Allowing us to avoid placing extra boiler plate into every driver that does something interesting with sysfs. Currently the function that provides that capability is device_rename. That is the code works without nasty side effects as originally written. So remove the misguided fix for moving devices between namespaces. The bug in the kobject layer that inspired it has now been recognized and fixed. Signed-off-by: Eric W. Biederman Acked-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 28 +++++----------------------- net/core/net-sysfs.c | 16 +--------------- net/core/net-sysfs.h | 1 - 3 files changed, 6 insertions(+), 39 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6c82065..d273e4e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1002,15 +1002,10 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; rollback: - /* For now only devices in the initial network namespace - * are in sysfs. - */ - if (net_eq(net, &init_net)) { - ret = device_rename(&dev->dev, dev->name); - if (ret) { - memcpy(dev->name, oldname, IFNAMSIZ); - return ret; - } + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; } write_lock_bh(&dev_base_lock); @@ -4994,8 +4989,6 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; - netdev_initialize_kobject(dev); - ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -5547,15 +5540,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char if (dev->features & NETIF_F_NETNS_LOCAL) goto out; -#ifdef CONFIG_SYSFS - /* Don't allow real devices to be moved when sysfs - * is enabled. - */ - err = -EINVAL; - if (dev->dev.parent) - goto out; -#endif - /* Ensure the device has been registrered */ err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) @@ -5606,8 +5590,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_uc_flush(dev); dev_mc_flush(dev); - netdev_unregister_kobject(dev); - /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -5620,7 +5602,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char } /* Fixup kobjects */ - err = netdev_register_kobject(dev); + err = device_rename(&dev->dev, dev->name); WARN_ON(err); /* Add the device back in the hashes */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 6881e65..99e7052 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -808,9 +808,6 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) struct net_device *dev = to_net_dev(d); int retval; - if (!net_eq(dev_net(dev), &init_net)) - return 0; - /* pass interface to uevent. */ retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) @@ -869,9 +866,6 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); - if (!net_eq(dev_net(net), &init_net)) - return; - #ifdef CONFIG_RPS rx_queue_remove_kobjects(net); #endif @@ -886,6 +880,7 @@ int netdev_register_kobject(struct net_device *net) const struct attribute_group **groups = net->sysfs_groups; int error = 0; + device_initialize(dev); dev->class = &net_class; dev->platform_data = net; dev->groups = groups; @@ -908,9 +903,6 @@ int netdev_register_kobject(struct net_device *net) #endif #endif /* CONFIG_SYSFS */ - if (!net_eq(dev_net(net), &init_net)) - return 0; - error = device_add(dev); if (error) return error; @@ -939,12 +931,6 @@ void netdev_class_remove_file(struct class_attribute *class_attr) EXPORT_SYMBOL(netdev_class_create_file); EXPORT_SYMBOL(netdev_class_remove_file); -void netdev_initialize_kobject(struct net_device *net) -{ - struct device *device = &(net->dev); - device_initialize(device); -} - int netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 14e7524..805555e 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -4,5 +4,4 @@ int netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); -void netdev_initialize_kobject(struct net_device *); #endif -- cgit v1.1