diff options
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 213 |
1 files changed, 180 insertions, 33 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 75ef66f..6b95f74 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/prefetch.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -132,6 +133,9 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; +static struct delayed_work expires_work; +static unsigned long expires_ljiffies; + /* * Interface to generic destination cache. */ @@ -412,7 +416,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "HHUptod\tSpecDst"); else { struct rtable *r = v; - int len; + struct neighbour *n; + int len, HHUptod; + + rcu_read_lock(); + n = dst_get_neighbour(&r->dst); + HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; + rcu_read_unlock(); seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", @@ -427,8 +437,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_RTTVAR)), r->rt_key_tos, r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, - r->dst.hh ? (r->dst.hh->hh_output == - dev_queue_xmit) : 0, + HHUptod, r->rt_spec_dst, &len); seq_printf(seq, "%*s\n", 127 - len, ""); @@ -821,6 +830,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) return ONE; } +static void rt_check_expire(void) +{ + static unsigned int rover; + unsigned int i = rover, goal; + struct rtable *rth; + struct rtable __rcu **rthp; + unsigned long samples = 0; + unsigned long sum = 0, sum2 = 0; + unsigned long delta; + u64 mult; + + delta = jiffies - expires_ljiffies; + expires_ljiffies = jiffies; + mult = ((u64)delta) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) + goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { + unsigned long tmo = ip_rt_gc_timeout; + unsigned long length; + + i = (i + 1) & rt_hash_mask; + rthp = &rt_hash_table[i].chain; + + if (need_resched()) + cond_resched(); + + samples++; + + if (rcu_dereference_raw(*rthp) == NULL) + continue; + length = 0; + spin_lock_bh(rt_hash_lock_addr(i)); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { + prefetch(rth->dst.rt_next); + if (rt_is_expired(rth)) { + *rthp = rth->dst.rt_next; + rt_free(rth); + continue; + } + if (rth->dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(jiffies, rth->dst.expires)) { +nofree: + tmo >>= 1; + rthp = &rth->dst.rt_next; + /* + * We only count entries on + * a chain with equal hash inputs once + * so that entries for different QOS + * levels, and other non-hash input + * attributes don't unfairly skew + * the length computation + */ + length += has_noalias(rt_hash_table[i].chain, rth); + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) + goto nofree; + + /* Cleanup aged off entries. */ + *rthp = rth->dst.rt_next; + rt_free(rth); + } + spin_unlock_bh(rt_hash_lock_addr(i)); + sum += length; + sum2 += length*length; + } + if (samples) { + unsigned long avg = sum / samples; + unsigned long sd = int_sqrt(sum2 / samples - avg*avg); + rt_chain_length_max = max_t(unsigned long, + ip_rt_gc_elasticity, + (avg + 4*sd) >> FRACT_BITS); + } + rover = i; +} + +/* + * rt_worker_func() is run in process context. + * we call rt_check_expire() to scan part of the hash table + */ +static void rt_worker_func(struct work_struct *work) +{ + rt_check_expire(); + schedule_delayed_work(&expires_work, ip_rt_gc_interval); +} + /* * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -1269,11 +1369,41 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; + + dst_confirm(&rt->dst); + + rt->rt_gateway = peer->redirect_learned.a4; + n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + } + return 0; +} + /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { + int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); + __be32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; struct inet_peer *peer; struct net *net; @@ -1296,13 +1426,43 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - peer = inet_getpeer_v4(daddr, 1); - if (peer) { - peer->redirect_learned.a4 = new_gw; + for (s = 0; s < 2; s++) { + for (i = 0; i < 2; i++) { + unsigned int hash; + struct rtable __rcu **rthp; + struct rtable *rt; - inet_putpeer(peer); + hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); - atomic_inc(&__rt_peer_genid); + rthp = &rt_hash_table[hash].chain; + + while ((rt = rcu_dereference(*rthp)) != NULL) { + rthp = &rt->dst.rt_next; + + if (rt->rt_key_dst != daddr || + rt->rt_key_src != skeys[s] || + rt->rt_oif != ikeys[i] || + rt_is_input_route(rt) || + rt_is_expired(rt) || + !net_eq(dev_net(rt->dst.dev), net) || + rt->dst.error || + rt->dst.dev != dev || + rt->rt_gateway != old_gw) + continue; + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + if (peer->redirect_learned.a4 != new_gw) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + check_peer_redir(&rt->dst, peer); + } + } + } } return; @@ -1589,31 +1749,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } } -static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) -{ - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - - dst_confirm(&rt->dst); - - neigh_release(rt->dst.neighbour); - rt->dst.neighbour = NULL; - - rt->rt_gateway = peer->redirect_learned.a4; - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); - rt->rt_gateway = orig_gw; - return -EAGAIN; - } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, - rt->dst.neighbour); - } - return 0; -} - static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; @@ -3088,6 +3223,13 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec_jiffies, }, { + .procname = "gc_interval", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), @@ -3297,6 +3439,11 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); + INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); + expires_ljiffies = jiffies; + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); + if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM |