Diffstat (limited to 'net')
43 files changed, 1318 insertions(+), 578 deletions(-)
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 26f3227..1958ad1 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1011,7 +1011,7 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsign /* ---- TTY structure ---- */ -static struct tty_operations rfcomm_ops = { +static const struct tty_operations rfcomm_ops = { .open = rfcomm_tty_open, .close = rfcomm_tty_close, .write = rfcomm_tty_write, diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 770c0df..b54306a 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -22,24 +22,37 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr, const void *data, unsigned int datalen) { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; + int action = info->target & -16; - if ((*pskb)->nfmark != info->mark) + if (action == MARK_SET_VALUE) (*pskb)->nfmark = info->mark; + else if (action == MARK_OR_VALUE) + (*pskb)->nfmark |= info->mark; + else if (action == MARK_AND_VALUE) + (*pskb)->nfmark &= info->mark; + else + (*pskb)->nfmark ^= info->mark; - return info->target; + return info->target | -16; } static int ebt_target_mark_check(const char *tablename, unsigned int hookmask, const struct ebt_entry *e, void *data, unsigned int datalen) { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; + int tmp; if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_t_info))) return -EINVAL; - if (BASE_CHAIN && info->target == EBT_RETURN) + tmp = info->target | -16; + if (BASE_CHAIN && tmp == EBT_RETURN) return -EINVAL; CLEAR_BASE_CHAIN_BIT; - if (INVALID_TARGET) + if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) + return -EINVAL; + tmp = info->target & -16; + if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE && + tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE) return -EINVAL; return 0; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8ce8c47..b4b4783 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -344,12 +344,12 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, { struct neighbour *n; int key_len = tbl->key_len; - u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + u32 hash_val = tbl->hash(pkey, dev); NEIGH_CACHE_STAT_INC(tbl, lookups); read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); @@ -364,12 +364,12 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) { struct neighbour *n; int key_len = tbl->key_len; - u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; + u32 hash_val = tbl->hash(pkey, NULL); NEIGH_CACHE_STAT_INC(tbl, lookups); read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { if (!memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); @@ -1998,12 +1998,12 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + read_lock_bh(&tbl->lock); for (h = 0; h <= tbl->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { if (idx < s_idx) continue; @@ -2016,8 
+2016,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, goto out; } } - read_unlock_bh(&tbl->lock); } + read_unlock_bh(&tbl->lock); rc = skb->len; out: cb->args[1] = h; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c448c7f..3c23760 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -156,7 +156,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, /* Get the DATA. Size must match skb_add_mtu(). */ size = SKB_DATA_ALIGN(size); - data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + data = kmalloc_track_caller(size + sizeof(struct skb_shared_info), + gfp_mask); if (!data) goto nodata; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 30af4a4..5572071 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -64,7 +64,7 @@ config ASK_IP_FIB_HASH config IP_FIB_TRIE bool "FIB_TRIE" ---help--- - Use new experimental LC-trie as FIB lookup algoritm. + Use new experimental LC-trie as FIB lookup algorithm. This improves lookup performance if you have a large number of routes. @@ -434,6 +434,15 @@ config INET_XFRM_MODE_TUNNEL If unsure, say Y. +config INET_XFRM_MODE_BEET + tristate "IP: IPsec BEET mode" + default y + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y @@ -526,7 +535,7 @@ config TCP_CONG_HYBLA ---help--- TCP-Hybla is a sender-side only change that eliminates penalization of long-RTT, large-bandwidth connections, like when satellite legs are - involved, expecially when sharing a common bottleneck with normal + involved, especially when sharing a common bottleneck with normal terrestrial connections. config TCP_CONG_VEGAS @@ -556,7 +565,7 @@ config TCP_CONG_LP default n ---help--- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is - to utiliza only the excess network bandwidth as compared to the + to utilize only the excess network bandwidth as compared to the ``fair share`` of bandwidth as targeted by TCP. 
See http://www-ece.rice.edu/networks/TCP-LP/ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f66049e..15645c5 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o +obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cfe5c84..cfb5d3d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,4 +1,4 @@ -/* linux/net/inet/arp.c +/* linux/net/ipv4/arp.c * * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ * diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 13b2936..b5c205b 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -253,7 +253,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) * as per draft-ietf-ipsec-udp-encaps-06, * section 3.1.2 */ - if (x->props.mode == XFRM_MODE_TRANSPORT) + if (x->props.mode == XFRM_MODE_TRANSPORT || + x->props.mode == XFRM_MODE_BEET) skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -271,17 +272,28 @@ static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) { struct esp_data *esp = x->data; u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); - - if (x->props.mode == XFRM_MODE_TUNNEL) { - mtu = ALIGN(mtu + 2, blksize); - } else { - /* The worst case. */ + int enclen = 0; + + switch (x->props.mode) { + case XFRM_MODE_TUNNEL: + mtu = ALIGN(mtu +2, blksize); + break; + default: + case XFRM_MODE_TRANSPORT: + /* The worst case */ mtu = ALIGN(mtu + 2, 4) + blksize - 4; + break; + case XFRM_MODE_BEET: + /* The worst case. 
*/ + enclen = IPV4_BEET_PHMAXLEN; + mtu = ALIGN(mtu + enclen + 2, blksize); + break; } + if (esp->conf.padlen) mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_trunc_len; + return mtu + x->props.header_len + esp->auth.icv_trunc_len - enclen; } static void esp4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2017d36..3839b70 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -206,6 +206,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t; + u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (t == NULL) @@ -216,7 +217,9 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->id.daddr.a4 = x->id.daddr.a4; memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET; - t->props.mode = XFRM_MODE_TUNNEL; + if (x->props.mode == XFRM_MODE_BEET) + mode = x->props.mode; + t->props.mode = mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 1fbb384..f8ce847 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -366,7 +366,7 @@ static int __init ic_defaults(void) */ if (!ic_host_name_set) - sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); if (root_server_addr == INADDR_NONE) root_server_addr = ic_servaddr; @@ -805,7 +805,7 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); ic_host_name_set = 1; break; case 15: /* Domain name (DNS) */ @@ -816,7 +816,7 @@ static void __init ic_do_bootp_ext(u8 *ext) ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); break; case 40: /* NIS Domain name (_not_ DNS) */ - ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; } } @@ -1368,7 +1368,7 @@ static int __init ip_auto_config(void) printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); printk(",\n host=%s, domain=%s, nis-domain=%s", - system_utsname.nodename, ic_domain, system_utsname.domainname); + utsname()->nodename, ic_domain, utsname()->domainname); printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); printk(", rootpath=%s", root_server_path); @@ -1478,11 +1478,11 @@ static int __init ip_auto_config_setup(char *addrs) case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; - strlcpy(system_utsname.domainname, dp, - sizeof(system_utsname.domainname)); + strlcpy(utsname()->domainname, dp, + sizeof(utsname()->domainname)); } - strlcpy(system_utsname.nodename, ip, - sizeof(system_utsname.nodename)); + strlcpy(utsname()->nodename, ip, + sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index c9820bf..891b935 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -81,7 +81,7 @@ config IP_VS_PROTO_ESP bool "ESP load balancing support" depends on IP_VS ---help--- - This option enables support for load balancing ESP (Encapsultion + This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. 
Say Y if unsure. config IP_VS_PROTO_AH @@ -204,7 +204,7 @@ config IP_VS_SED connections to the server with the shortest expected delay. The expected delay that the job will experience is (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of connections - on the the ith server and Ui is the fixed service rate (weight) + on the ith server and Ui is the fixed service rate (weight) of the ith server. If you want to compile it in kernel, say Y. To compile it as a diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 6dee039..1445bb4 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -813,6 +813,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, skb->nh.iph->saddr = cp->vaddr; ip_send_check(skb->nh.iph); + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_route_me_harder(pskb, RTN_LOCAL) != 0) + goto drop; + skb = *pskb; + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 5ac1537..e2005c6 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -8,7 +8,7 @@ #include <net/ip.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff **pskb) +int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type) { struct iphdr *iph = (*pskb)->nh.iph; struct rtable *rt; @@ -16,10 +16,13 @@ int ip_route_me_harder(struct sk_buff **pskb) struct dst_entry *odst; unsigned int hh_len; + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(iph->saddr); + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. */ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + if (addr_type == RTN_LOCAL) { fl.nl_u.ip4_u.daddr = iph->daddr; fl.nl_u.ip4_u.saddr = iph->saddr; fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); @@ -156,7 +159,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info) if (!(iph->tos == rt_info->tos && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(pskb); + return ip_route_me_harder(pskb, RTN_UNSPEC); } return 0; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a55b8ff..d88c292 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -373,7 +373,7 @@ config IP_NF_TARGET_ULOG daemon using netlink multicast sockets; unlike the LOG target which can only be viewed through syslog. - The apropriate userspace logging daemon (ulogd) may be obtained from + The appropriate userspace logging daemon (ulogd) may be obtained from <http://www.gnumonks.org/projects/ulogd/> To compile it as a module, choose M here. If unsure, say N. diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 021395b..d85d2de 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -265,7 +265,8 @@ ip_nat_local_fn(unsigned int hooknum, ct->tuplehash[!dir].tuple.src.u.all #endif ) - return ip_route_me_harder(pskb) == 0 ? 
ret : NF_DROP; + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; } return ret; } diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index fd0c05e..ad0312d 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -38,76 +38,16 @@ MODULE_DESCRIPTION("iptables REJECT target module"); #define DEBUGP(format, args...) #endif -static inline struct rtable *route_reverse(struct sk_buff *skb, - struct tcphdr *tcph, int hook) -{ - struct iphdr *iph = skb->nh.iph; - struct dst_entry *odst; - struct flowi fl = {}; - struct rtable *rt; - - /* We don't require ip forwarding to be enabled to be able to - * send a RST reply for bridged traffic. */ - if (hook != NF_IP_FORWARD -#ifdef CONFIG_BRIDGE_NETFILTER - || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED) -#endif - ) { - fl.nl_u.ip4_u.daddr = iph->saddr; - if (hook == NF_IP_LOCAL_IN) - fl.nl_u.ip4_u.saddr = iph->daddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - - if (ip_route_output_key(&rt, &fl) != 0) - return NULL; - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl.nl_u.ip4_u.daddr = iph->daddr; - if (ip_route_output_key(&rt, &fl) != 0) - return NULL; - - odst = skb->dst; - if (ip_route_input(skb, iph->saddr, iph->daddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); - return NULL; - } - dst_release(&rt->u.dst); - rt = (struct rtable *)skb->dst; - skb->dst = odst; - - fl.nl_u.ip4_u.daddr = iph->saddr; - fl.nl_u.ip4_u.saddr = iph->daddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - } - - if (rt->u.dst.error) { - dst_release(&rt->u.dst); - return NULL; - } - - fl.proto = IPPROTO_TCP; - fl.fl_ip_sport = tcph->dest; - fl.fl_ip_dport = tcph->source; - security_skb_classify_flow(skb, &fl); - - xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); - - return rt; -} - /* Send RST reply */ static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; struct iphdr *iph = oldskb->nh.iph; struct tcphdr _otcph, *oth, *tcph; - struct rtable *rt; __be16 tmp_port; __be32 tmp_addr; int needs_ack; - int hh_len; + unsigned int addr_type; /* IP header checks: fragment. */ if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) @@ -126,23 +66,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (nf_ip_checksum(oldskb, hook, iph->ihl * 4, IPPROTO_TCP)) return; - if ((rt = route_reverse(oldskb, oth, hook)) == NULL) - return; - - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - /* We need a linear, writeable skb. 
We also need to expand headroom in case hh_len of incoming interface < hh_len of outgoing interface */ - nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb), + nskb = skb_copy_expand(oldskb, LL_MAX_HEADER, skb_tailroom(oldskb), GFP_ATOMIC); - if (!nskb) { - dst_release(&rt->u.dst); + if (!nskb) return; - } - - dst_release(nskb->dst); - nskb->dst = &rt->u.dst; /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); @@ -184,6 +114,21 @@ static void send_reset(struct sk_buff *oldskb, int hook) tcph->window = 0; tcph->urg_ptr = 0; + /* Set DF, id = 0 */ + nskb->nh.iph->frag_off = htons(IP_DF); + nskb->nh.iph->id = 0; + + addr_type = RTN_UNSPEC; + if (hook != NF_IP_FORWARD +#ifdef CONFIG_BRIDGE_NETFILTER + || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) +#endif + ) + addr_type = RTN_LOCAL; + + if (ip_route_me_harder(&nskb, addr_type)) + goto free_nskb; + /* Adjust TCP checksum */ nskb->ip_summed = CHECKSUM_NONE; tcph->check = 0; @@ -192,12 +137,8 @@ static void send_reset(struct sk_buff *oldskb, int hook) nskb->nh.iph->daddr, csum_partial((char *)tcph, sizeof(struct tcphdr), 0)); - - /* Adjust IP TTL, DF */ + /* Adjust IP TTL */ nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); - /* Set DF, id = 0 */ - nskb->nh.iph->frag_off = htons(IP_DF); - nskb->nh.iph->id = 0; /* Adjust IP checksum */ nskb->nh.iph->check = 0; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index e62ea2b..b91f358 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -157,7 +157,8 @@ ipt_local_hook(unsigned int hook, || (*pskb)->nfmark != nfmark #endif || (*pskb)->nh.iph->tos != tos)) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; return ret; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3f884ce..cf06acc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2259,7 +2259,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) u32 pkts_acked = 0; void (*rtt_sample)(struct sock *sk, u32 usrtt) = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv; + struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index dab37d2..4be336f 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -99,8 +99,10 @@ static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, } static struct jprobe tcp_send_probe = { - .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, - .entry = (kprobe_opcode_t *) &jtcp_sendmsg, + .kp = { + .symbol_name = "tcp_sendmsg", + }, + .entry = JPROBE_ENTRY(jtcp_sendmsg), }; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6d6142f..865d752 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -675,6 +675,8 @@ do_append_data: udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk, up); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; release_sock(sk); out: diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c new file mode 100644 index 0000000..89cf59e --- /dev/null +++ b/net/ipv4/xfrm4_mode_beet.c @@ -0,0 +1,139 @@ +/* + * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4. 
+ * + * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com> + * Miika Komu <miika@iki.fi> + * Herbert Xu <herbert@gondor.apana.org.au> + * Abhinav Pathak <abhinav.pathak@hiit.fi> + * Jeff Ahrenholz <ahrenholz@gmail.com> + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dst.h> +#include <net/ip.h> +#include <net/xfrm.h> + +/* Add encapsulation header. + * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. + * The following fields in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph, *top_iph = NULL; + int hdrlen, optlen; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + hdrlen = 0; + optlen = iph->ihl * 4 - sizeof(*iph); + if (unlikely(optlen)) + hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); + + skb->nh.raw = skb_push(skb, x->props.header_len + hdrlen); + top_iph = skb->nh.iph; + hdrlen = iph->ihl * 4 - optlen; + skb->h.raw += hdrlen; + + memmove(top_iph, iph, hdrlen); + if (unlikely(optlen)) { + struct ip_beet_phdr *ph; + + BUG_ON(optlen < 0); + + ph = (struct ip_beet_phdr *)skb->h.raw; + ph->padlen = 4 - (optlen & 4); + ph->hdrlen = (optlen + ph->padlen + sizeof(*ph)) / 8; + ph->nexthdr = top_iph->protocol; + + top_iph->protocol = IPPROTO_BEETPH; + top_iph->ihl = sizeof(struct iphdr) / 4; + } + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + + return 0; +} + +static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int phlen = 0; + int optlen = 0; + __u8 ph_nexthdr = 0, protocol = 0; + int err = -EINVAL; + + protocol = iph->protocol; + + if (unlikely(iph->protocol == IPPROTO_BEETPH)) { + struct ip_beet_phdr *ph = (struct ip_beet_phdr*)(iph + 1); + + if (!pskb_may_pull(skb, sizeof(*ph))) + goto out; + + phlen = ph->hdrlen * 8; + optlen = phlen - ph->padlen - sizeof(*ph); + if (optlen < 0 || optlen & 3 || optlen > 250) + goto out; + + if (!pskb_may_pull(skb, phlen)) + goto out; + + ph_nexthdr = ph->nexthdr; + } + + skb_push(skb, sizeof(*iph) - phlen + optlen); + memmove(skb->data, skb->nh.raw, sizeof(*iph)); + skb->nh.raw = skb->data; + + iph = skb->nh.iph; + iph->ihl = (sizeof(*iph) + optlen) / 4; + iph->tot_len = htons(skb->len); + iph->daddr = x->sel.daddr.a4; + iph->saddr = x->sel.saddr.a4; + if (ph_nexthdr) + iph->protocol = ph_nexthdr; + else + iph->protocol = protocol; + iph->check = 0; + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm4_beet_mode = { + .input = xfrm4_beet_input, + .output = xfrm4_beet_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, +}; + +static int __init xfrm4_beet_init(void) +{ + return xfrm_register_mode(&xfrm4_beet_mode, AF_INET); +} + +static void __exit xfrm4_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_beet_init); +module_exit(xfrm4_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index a2d211d..a460e81 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -136,6 +136,16 @@ config INET6_XFRM_MODE_TUNNEL If unsure, say Y. 
+config INET6_XFRM_MODE_BEET + tristate "IPv6: IPsec BEET mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + config INET6_XFRM_MODE_ROUTEOPTIMIZATION tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" depends on IPV6 && EXPERIMENTAL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 0213c66..87274e4 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o +obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index a2860e3..71f59f1 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -199,6 +199,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t = NULL; + u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (!t) @@ -212,7 +213,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET6; - t->props.mode = XFRM_MODE_TUNNEL; + if (x->props.mode == XFRM_MODE_BEET) + mode = x->props.mode; + t->props.mode = mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); if (xfrm_init_state(t)) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9662561..e0c3934 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -546,7 +546,7 @@ static int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct in6_addr *daddr, *final_p = NULL, final; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; - struct flowi *fl = &inet->cork.fl; + struct flowi fl; struct dst_entry *dst; int addr_len = msg->msg_namelen; int ulen = len; @@ -626,19 +626,19 @@ do_udp_sendmsg: } ulen += sizeof(struct udphdr); - memset(fl, 0, sizeof(*fl)); + memset(&fl, 0, sizeof(fl)); if (sin6) { if (sin6->sin6_port == 0) return -EINVAL; - fl->fl_ip_dport = sin6->sin6_port; + fl.fl_ip_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; daddr = &flowlabel->dst; @@ -656,32 +656,32 @@ do_udp_sendmsg: if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl->oif = sin6->sin6_scope_id; + fl.oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - fl->fl_ip_dport = inet->dport; + fl.fl_ip_dport = inet->dport; daddr = &np->daddr; - fl->fl6_flowlabel = np->flow_label; + fl.fl6_flowlabel = np->flow_label; connected = 1; } - if (!fl->oif) - fl->oif = sk->sk_bound_dev_if; + if (!fl.oif) + fl.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); - err = datagram_send_ctl(msg, fl, opt, &hlimit, &tclass); + err = 
datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -695,39 +695,39 @@ do_udp_sendmsg: opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); - fl->proto = IPPROTO_UDP; - ipv6_addr_copy(&fl->fl6_dst, daddr); - if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl->fl6_src, &np->saddr); - fl->fl_ip_sport = inet->sport; + fl.proto = IPPROTO_UDP; + ipv6_addr_copy(&fl.fl6_dst, daddr); + if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl_ip_sport = inet->sport; /* merge ip6_build_xmit from ip6_output */ if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl->fl6_dst); - ipv6_addr_copy(&fl->fl6_dst, rt0->addr); + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); final_p = &final; connected = 0; } - if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) { - fl->oif = np->mcast_oif; + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) { + fl.oif = np->mcast_oif; connected = 0; } - security_sk_classify_flow(sk, fl); + security_sk_classify_flow(sk, &fl); - err = ip6_sk_dst_lookup(sk, &dst, fl); + err = ip6_sk_dst_lookup(sk, &dst, &fl); if (err) goto out; if (final_p) - ipv6_addr_copy(&fl->fl6_dst, final_p); + ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) goto out; if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl->fl6_dst)) + if (ipv6_addr_is_multicast(&fl.fl6_dst)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; @@ -763,21 +763,23 @@ back_from_confirm: do_append_data: up->len += ulen; err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, fl, + sizeof(struct udphdr), hlimit, tclass, opt, &fl, (struct rt6_info*)dst, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) err = udp_v6_push_pending_frames(sk, up); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; if (dst) { if (connected) { ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ? + ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? &np->daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl->fl6_src, &np->saddr) ? + ipv6_addr_equal(&fl.fl6_src, &np->saddr) ? &np->saddr : #endif NULL); diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c new file mode 100644 index 0000000..edcfffa --- /dev/null +++ b/net/ipv6/xfrm6_mode_beet.c @@ -0,0 +1,107 @@ +/* + * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6. + * + * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com> + * Miika Komu <miika@iki.fi> + * Herbert Xu <herbert@gondor.apana.org.au> + * Abhinav Pathak <abhinav.pathak@hiit.fi> + * Jeff Ahrenholz <ahrenholz@gmail.com> + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dsfield.h> +#include <net/dst.h> +#include <net/inet_ecn.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +/* Add encapsulation header. 
+ * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. + * The following fields in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph, *top_iph; + u8 *prevhdr; + int hdr_len; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + + return 0; +} + +static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + int size = sizeof(struct ipv6hdr); + int err = -EINVAL; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + skb_push(skb, size); + memmove(skb->data, skb->nh.raw, size); + skb->nh.raw = skb->data; + + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + + ip6h = skb->nh.ipv6h; + ip6h->payload_len = htons(skb->len - size); + ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); + ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm6_beet_mode = { + .input = xfrm6_beet_input, + .output = xfrm6_beet_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, +}; + +static int __init xfrm6_beet_init(void) +{ + return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6); +} + +static void __exit xfrm6_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_beet_init); +module_exit(xfrm6_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET); diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 3bcdb46..d50a020 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -79,7 +79,7 @@ static struct tty_driver *driver; hashbin_t *ircomm_tty = NULL; -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = ircomm_tty_open, .close = ircomm_tty_close, .write = ircomm_tty_write, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0a28d2c..ce94732 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -365,7 +365,7 @@ config NETFILTER_XT_MATCH_MULTIPORT config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' - depends on NETFILTER_XTABLES && BRIDGE_NETFILTER + depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER help Physdev packet matching matches against the physical bridge ports the IP packet arrived on or will leave by. diff --git a/net/sched/estimator.c b/net/sched/estimator.c deleted file mode 100644 index 0ebc98e..0000000 --- a/net/sched/estimator.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * net/sched/estimator.c Simple rate estimator. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <net/sock.h> -#include <net/pkt_sched.h> - -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. - Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<<interval) seconds and evaluate EWMA: - - avrate = avrate*(1-W) + rate*W - - where W is chosen as negative power of 2: W = 2^(-ewma_log) - - The resulting time constant is: - - T = A/(-ln(1-W)) - - - NOTES. - - * The stored value for avbps is scaled by 2^5, so that maximal - rate is ~1Gbit, avpps is scaled by 2^10. - - * Minimal interval is HZ/4=250msec (it is the greatest common divisor - for HZ=100 and HZ=1024 8)), maximal interval - is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals - are too expensive, longer ones can be implemented - at user level painlessly. 
- */ - -#define EST_MAX_INTERVAL 5 - -struct qdisc_estimator -{ - struct qdisc_estimator *next; - struct tc_stats *stats; - spinlock_t *stats_lock; - unsigned interval; - int ewma_log; - u64 last_bytes; - u32 last_packets; - u32 avpps; - u32 avbps; -}; - -struct qdisc_estimator_head -{ - struct timer_list timer; - struct qdisc_estimator *list; -}; - -static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; - -/* Estimator array lock */ -static DEFINE_RWLOCK(est_lock); - -static void est_timer(unsigned long arg) -{ - int idx = (int)arg; - struct qdisc_estimator *e; - - read_lock(&est_lock); - for (e = elist[idx].list; e; e = e->next) { - struct tc_stats *st = e->stats; - u64 nbytes; - u32 npackets; - u32 rate; - - spin_lock(e->stats_lock); - nbytes = st->bytes; - npackets = st->packets; - rate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; - e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; - st->bps = (e->avbps+0xF)>>5; - - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; - e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; - e->stats->pps = (e->avpps+0x1FF)>>10; - spin_unlock(e->stats_lock); - } - - mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); - read_unlock(&est_lock); -} - -int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) -{ - struct qdisc_estimator *est; - struct tc_estimator *parm = RTA_DATA(opt); - - if (RTA_PAYLOAD(opt) < sizeof(*parm)) - return -EINVAL; - - if (parm->interval < -2 || parm->interval > 3) - return -EINVAL; - - est = kzalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) - return -ENOBUFS; - - est->interval = parm->interval + 2; - est->stats = stats; - est->stats_lock = stats_lock; - est->ewma_log = parm->ewma_log; - est->last_bytes = stats->bytes; - est->avbps = stats->bps<<5; - est->last_packets = stats->packets; - est->avpps = stats->pps<<10; - - est->next = elist[est->interval].list; - if (est->next == NULL) { - init_timer(&elist[est->interval].timer); - elist[est->interval].timer.data = est->interval; - elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); - elist[est->interval].timer.function = est_timer; - add_timer(&elist[est->interval].timer); - } - write_lock_bh(&est_lock); - elist[est->interval].list = est; - write_unlock_bh(&est_lock); - return 0; -} - -void qdisc_kill_estimator(struct tc_stats *stats) -{ - int idx; - struct qdisc_estimator *est, **pest; - - for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { - int killed = 0; - pest = &elist[idx].list; - while ((est=*pest) != NULL) { - if (est->stats != stats) { - pest = &est->next; - continue; - } - - write_lock_bh(&est_lock); - *pest = est->next; - write_unlock_bh(&est_lock); - - kfree(est); - killed++; - } - if (killed && elist[idx].list == NULL) - del_timer(&elist[idx].timer); - } -} - -EXPORT_SYMBOL(qdisc_kill_estimator); -EXPORT_SYMBOL(qdisc_new_estimator); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 6c058e3..bb3ddd4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -391,7 +391,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q, /* If this triggers, it is a bug in this code, but it need not be fatal */ static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) { - if (!RB_EMPTY_NODE(rb)) { + if (RB_EMPTY_NODE(rb)) { WARN_ON(1); } else { rb_erase(rb, root); diff --git a/net/socket.c b/net/socket.c index 01918f7..6c9b9b3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -825,7 +825,7 @@ static long sock_ioctl(struct 
file *file, unsigned cmd, unsigned long arg) break; case FIOGETOWN: case SIOCGPGRP: - err = put_user(sock->file->f_owner.pid, + err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index a6ed2d2..b36b946 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1,5 +1,5 @@ /* - * linux/net/sunrpc/auth_gss.c + * linux/net/sunrpc/auth_gss/auth_gss.c * * RPCSEC_GSS client authentication. * diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 638c0b5..447d9ae 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -903,9 +903,9 @@ out_seq: struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; - /* pointer to the beginning of the procedure-specific results, - * which may be encrypted/checksummed in svcauth_gss_release: */ - __be32 *body_start; + /* save a pointer to the beginning of the encoded verifier, + * for use in encryption/checksumming in svcauth_gss_release: */ + __be32 *verf_start; struct rsc *rsci; }; @@ -968,7 +968,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; - svcdata->body_start = NULL; + svcdata->verf_start = NULL; svcdata->rsci = NULL; gc = &svcdata->clcred; @@ -1097,6 +1097,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) goto complete; case RPC_GSS_PROC_DATA: *authp = rpcsec_gsserr_ctxproblem; + svcdata->verf_start = resv->iov_base + resv->iov_len; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; rqstp->rq_cred = rsci->cred; @@ -1110,7 +1111,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) gc->gc_seq, rsci->mechctx)) goto auth_err; /* placeholders for length and seq. number: */ - svcdata->body_start = resv->iov_base + resv->iov_len; svc_putnl(resv, 0); svc_putnl(resv, 0); break; @@ -1119,7 +1119,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) gc->gc_seq, rsci->mechctx)) goto auth_err; /* placeholders for length and seq. number: */ - svcdata->body_start = resv->iov_base + resv->iov_len; svc_putnl(resv, 0); svc_putnl(resv, 0); break; @@ -1147,6 +1146,32 @@ out: return ret; } +u32 * +svcauth_gss_prepare_to_wrap(struct xdr_buf *resbuf, struct gss_svc_data *gsd) +{ + u32 *p, verf_len; + + p = gsd->verf_start; + gsd->verf_start = NULL; + + /* If the reply stat is nonzero, don't wrap: */ + if (*(p-1) != rpc_success) + return NULL; + /* Skip the verifier: */ + p += 1; + verf_len = ntohl(*p++); + p += XDR_QUADLEN(verf_len); + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* Also don't wrap if the accept stat is nonzero: */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + return NULL; + } + p++; + return p; +} + static inline int svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) { @@ -1160,17 +1185,9 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) int integ_offset, integ_len; int stat = -EINVAL; - p = gsd->body_start; - gsd->body_start = NULL; - /* move accept_stat to right place: */ - memcpy(p, p + 2, 4); - /* Don't wrap in failure case: */ - /* Counting on not getting here if call was not even accepted! 
*/ - if (*p != rpc_success) { - resbuf->head[0].iov_len -= 2 * 4; + p = svcauth_gss_prepare_to_wrap(resbuf, gsd); + if (p == NULL) goto out; - } - p++; integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; integ_len = resbuf->len - integ_offset; BUG_ON(integ_len % 4); @@ -1191,7 +1208,6 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len; resbuf->tail[0].iov_len = 0; - rqstp->rq_restailpage = 0; resv = &resbuf->tail[0]; } else { resv = &resbuf->tail[0]; @@ -1223,24 +1239,16 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) int offset; int pad; - p = gsd->body_start; - gsd->body_start = NULL; - /* move accept_stat to right place: */ - memcpy(p, p + 2, 4); - /* Don't wrap in failure case: */ - /* Counting on not getting here if call was not even accepted! */ - if (*p != rpc_success) { - resbuf->head[0].iov_len -= 2 * 4; + p = svcauth_gss_prepare_to_wrap(resbuf, gsd); + if (p == NULL) return 0; - } - p++; len = p++; offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; *p++ = htonl(gc->gc_seq); inpages = resbuf->pages; /* XXX: Would be better to write some xdr helper functions for * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ - if (resbuf->tail[0].iov_base && rqstp->rq_restailpage == 0) { + if (resbuf->tail[0].iov_base) { BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base + PAGE_SIZE); BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base); @@ -1258,7 +1266,6 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE; resbuf->tail[0].iov_len = 0; - rqstp->rq_restailpage = 0; } if (gss_wrap(gsd->rsci->mechctx, offset, resbuf, inpages)) return -ENOMEM; @@ -1282,7 +1289,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; /* Release can be called twice, but we only wrap once. */ - if (gsd->body_start == NULL) + if (gsd->verf_start == NULL) goto out; /* normally not set till svc_send, but we need it here: */ /* XXX: what for? Do we mess it up the moment we call svc_putu32 diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 124ff0c..78696f2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -161,10 +161,10 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s } /* save the nodename */ - clnt->cl_nodelen = strlen(system_utsname.nodename); + clnt->cl_nodelen = strlen(utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); return clnt; out_no_auth: diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 26c0531..192dff5 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -70,6 +70,8 @@ EXPORT_SYMBOL(put_rpccred); /* RPC server stuff */ EXPORT_SYMBOL(svc_create); EXPORT_SYMBOL(svc_create_thread); +EXPORT_SYMBOL(svc_create_pooled); +EXPORT_SYMBOL(svc_set_num_threads); EXPORT_SYMBOL(svc_exit_thread); EXPORT_SYMBOL(svc_destroy); EXPORT_SYMBOL(svc_drop); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 44b8d9d..c2c8bb2 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -4,6 +4,10 @@ * High-level RPC service routines * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + * + * Multiple threads pools and NUMAisation + * Copyright (c) 2006 Silicon Graphics, Inc. 
+ * by Greg Banks <gnb@melbourne.sgi.com> */ #include <linux/linkage.h> @@ -12,6 +16,8 @@ #include <linux/net.h> #include <linux/in.h> #include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/module.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> @@ -23,14 +29,252 @@ #define RPC_PARANOIA 1 /* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_NONE = -1, /* uninitialised, choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; + +/* + * Structure for mapping cpus to pools and vice versa. + * Setup once during sunrpc initialisation. + */ +static struct svc_pool_map { + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +} svc_pool_map = { + .mode = SVC_POOL_NONE +}; + + +/* + * Detect best pool mapping mode heuristically, + * according to the machine's topology. + */ +static int +svc_pool_map_choose_mode(void) +{ + unsigned int node; + + if (num_online_nodes() > 1) { + /* + * Actually have multiple NUMA nodes, + * so split pools on NUMA node boundaries + */ + return SVC_POOL_PERNODE; + } + + node = any_online_node(node_online_map); + if (nr_cpus_node(node) > 2) { + /* + * Non-trivial SMP, or CONFIG_NUMA on + * non-NUMA hardware, e.g. with a generic + * x86_64 kernel on Xeons. In this case we + * want to divide the pools on cpu boundaries. + */ + return SVC_POOL_PERCPU; + } + + /* default: one global pool */ + return SVC_POOL_GLOBAL; +} + +/* + * Allocate the to_pool[] and pool_to[] arrays. + * Returns 0 on success or an errno. + */ +static int +svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) +{ + m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->to_pool) + goto fail; + m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->pool_to) + goto fail_free; + + return 0; + +fail_free: + kfree(m->to_pool); +fail: + return -ENOMEM; +} + +/* + * Initialise the pool map for SVC_POOL_PERCPU mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_percpu(struct svc_pool_map *m) +{ + unsigned int maxpools = highest_possible_processor_id()+1; + unsigned int pidx = 0; + unsigned int cpu; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_online_cpu(cpu) { + BUG_ON(pidx > maxpools); + m->to_pool[cpu] = pidx; + m->pool_to[pidx] = cpu; + pidx++; + } + /* cpus brought online later all get mapped to pool0, sorry */ + + return pidx; +}; + + +/* + * Initialise the pool map for SVC_POOL_PERNODE mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_pernode(struct svc_pool_map *m) +{ + unsigned int maxpools = highest_possible_node_id()+1; + unsigned int pidx = 0; + unsigned int node; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_node_with_cpus(node) { + /* some architectures (e.g. SN2) have cpuless nodes */ + BUG_ON(pidx > maxpools); + m->to_pool[node] = pidx; + m->pool_to[pidx] = node; + pidx++; + } + /* nodes brought online later all get mapped to pool0, sorry */ + + return pidx; +} + + +/* + * Build the global map of cpus to pools and vice versa. 
+ */ +static unsigned int +svc_pool_map_init(void) +{ + struct svc_pool_map *m = &svc_pool_map; + int npools = -1; + + if (m->mode != SVC_POOL_NONE) + return m->npools; + + m->mode = svc_pool_map_choose_mode(); + + switch (m->mode) { + case SVC_POOL_PERCPU: + npools = svc_pool_map_init_percpu(m); + break; + case SVC_POOL_PERNODE: + npools = svc_pool_map_init_pernode(m); + break; + } + + if (npools < 0) { + /* default, or memory allocation failure */ + npools = 1; + m->mode = SVC_POOL_GLOBAL; + } + m->npools = npools; + + return m->npools; +} + +/* + * Set the current thread's cpus_allowed mask so that it + * will only run on cpus in the given pool. + * + * Returns 1 and fills in oldmask iff a cpumask was applied. + */ +static inline int +svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int node; /* or cpu */ + + /* + * The caller checks for sv_nrpools > 1, which + * implies that we've been initialized and the + * map mode is not NONE. + */ + BUG_ON(m->mode == SVC_POOL_NONE); + + switch (m->mode) + { + default: + return 0; + case SVC_POOL_PERCPU: + node = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(node)); + return 1; + case SVC_POOL_PERNODE: + node = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; + set_cpus_allowed(current, node_to_cpumask(node)); + return 1; + } +} + +/* + * Use the mapping mode to choose a pool for a given CPU. + * Used when enqueueing an incoming RPC. Always returns + * a non-NULL pool pointer. + */ +struct svc_pool * +svc_pool_for_cpu(struct svc_serv *serv, int cpu) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int pidx = 0; + + /* + * SVC_POOL_NONE happens in a pure client when + * lockd is brought up, so silently treat it the + * same as SVC_POOL_GLOBAL. + */ + + switch (m->mode) { + case SVC_POOL_PERCPU: + pidx = m->to_pool[cpu]; + break; + case SVC_POOL_PERNODE: + pidx = m->to_pool[cpu_to_node(cpu)]; + break; + } + return &serv->sv_pools[pidx % serv->sv_nrpools]; +} + + +/* * Create an RPC service */ -struct svc_serv * -svc_create(struct svc_program *prog, unsigned int bufsize) +static struct svc_serv * +__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; int vers; unsigned int xdrsize; + unsigned int i; if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; @@ -39,6 +283,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize) serv->sv_nrthreads = 1; serv->sv_stats = prog->pg_stats; serv->sv_bufsz = bufsize? 
bufsize : 4096; + serv->sv_shutdown = shutdown; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -53,20 +298,68 @@ svc_create(struct svc_program *prog, unsigned int bufsize) prog = prog->pg_next; } serv->sv_xdrsize = xdrsize; - INIT_LIST_HEAD(&serv->sv_threads); - INIT_LIST_HEAD(&serv->sv_sockets); INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); + init_timer(&serv->sv_temptimer); spin_lock_init(&serv->sv_lock); + serv->sv_nrpools = npools; + serv->sv_pools = + kcalloc(sizeof(struct svc_pool), serv->sv_nrpools, + GFP_KERNEL); + if (!serv->sv_pools) { + kfree(serv); + return NULL; + } + + for (i = 0; i < serv->sv_nrpools; i++) { + struct svc_pool *pool = &serv->sv_pools[i]; + + dprintk("initialising pool %u for %s\n", + i, serv->sv_name); + + pool->sp_id = i; + INIT_LIST_HEAD(&pool->sp_threads); + INIT_LIST_HEAD(&pool->sp_sockets); + INIT_LIST_HEAD(&pool->sp_all_threads); + spin_lock_init(&pool->sp_lock); + } + + /* Remove any stale portmap registrations */ svc_register(serv, 0, 0); return serv; } +struct svc_serv * +svc_create(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv)) +{ + return __svc_create(prog, bufsize, /*npools*/1, shutdown); +} + +struct svc_serv * +svc_create_pooled(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv), + svc_thread_fn func, int sig, struct module *mod) +{ + struct svc_serv *serv; + unsigned int npools = svc_pool_map_init(); + + serv = __svc_create(prog, bufsize, npools, shutdown); + + if (serv != NULL) { + serv->sv_function = func; + serv->sv_kill_signal = sig; + serv->sv_module = mod; + } + + return serv; +} + /* - * Destroy an RPC service + * Destroy an RPC service. Should be called with the BKL held */ void svc_destroy(struct svc_serv *serv) @@ -85,12 +378,17 @@ svc_destroy(struct svc_serv *serv) } else printk("svc_destroy: no threads for serv=%p!\n", serv); + del_timer_sync(&serv->sv_temptimer); + while (!list_empty(&serv->sv_tempsocks)) { svsk = list_entry(serv->sv_tempsocks.next, struct svc_sock, sk_list); svc_delete_socket(svsk); } + if (serv->sv_shutdown) + serv->sv_shutdown(serv); + while (!list_empty(&serv->sv_permsocks)) { svsk = list_entry(serv->sv_permsocks.next, struct svc_sock, @@ -102,6 +400,7 @@ svc_destroy(struct svc_serv *serv) /* Unregister service with the portmapper */ svc_register(serv, 0, 0); + kfree(serv->sv_pools); kfree(serv); } @@ -118,18 +417,15 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) if (size > RPCSVC_MAXPAYLOAD) size = RPCSVC_MAXPAYLOAD; pages = 2 + (size+ PAGE_SIZE -1) / PAGE_SIZE; - rqstp->rq_argused = 0; - rqstp->rq_resused = 0; arghi = 0; BUG_ON(pages > RPCSVC_MAXPAGES); while (pages) { struct page *p = alloc_page(GFP_KERNEL); if (!p) break; - rqstp->rq_argpages[arghi++] = p; + rqstp->rq_pages[arghi++] = p; pages--; } - rqstp->rq_arghi = arghi; return ! pages; } @@ -139,24 +435,25 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) static void svc_release_buffer(struct svc_rqst *rqstp) { - while (rqstp->rq_arghi) - put_page(rqstp->rq_argpages[--rqstp->rq_arghi]); - while (rqstp->rq_resused) { - if (rqstp->rq_respages[--rqstp->rq_resused] == NULL) - continue; - put_page(rqstp->rq_respages[rqstp->rq_resused]); - } - rqstp->rq_argused = 0; + int i; + for (i=0; i<ARRAY_SIZE(rqstp->rq_pages); i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); } /* - * Create a server thread + * Create a thread in the given pool. Caller must hold BKL. 
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread + * will be restricted to run on the cpus belonging to the pool. */ -int -svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +static int +__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, + struct svc_pool *pool) { struct svc_rqst *rqstp; int error = -ENOMEM; + int have_oldmask = 0; + cpumask_t oldmask; rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); if (!rqstp) @@ -170,8 +467,21 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) goto out_thread; serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); rqstp->rq_server = serv; + rqstp->rq_pool = pool; + + if (serv->sv_nrpools > 1) + have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); + error = kernel_thread((int (*)(void *)) func, rqstp, 0); + + if (have_oldmask) + set_cpus_allowed(current, oldmask); + if (error < 0) goto out_thread; svc_sock_update_bufs(serv); @@ -185,17 +495,136 @@ out_thread: } /* - * Destroy an RPC server thread + * Create a thread in the default pool. Caller must hold BKL. + */ +int +svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +{ + return __svc_create_thread(func, serv, &serv->sv_pools[0]); +} + +/* + * Choose a pool in which to create a new thread, for svc_set_num_threads + */ +static inline struct svc_pool * +choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + if (pool != NULL) + return pool; + + return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +} + +/* + * Choose a thread to kill, for svc_set_num_threads + */ +static inline struct task_struct * +choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + unsigned int i; + struct task_struct *task = NULL; + + if (pool != NULL) { + spin_lock_bh(&pool->sp_lock); + } else { + /* choose a pool in round-robin fashion */ + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_all_threads)) + goto found_pool; + spin_unlock_bh(&pool->sp_lock); + } + return NULL; + } + +found_pool: + if (!list_empty(&pool->sp_all_threads)) { + struct svc_rqst *rqstp; + + /* + * Remove from the pool->sp_all_threads list + * so we don't try to kill it again. + */ + rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); + list_del_init(&rqstp->rq_all); + task = rqstp->rq_task; + } + spin_unlock_bh(&pool->sp_lock); + + return task; +} + +/* + * Create or destroy enough new threads to make the number + * of threads the given number. If `pool' is non-NULL, applies + * only to threads in that pool, otherwise round-robins between + * all pools. Must be called with a svc_get() reference and + * the BKL held. + * + * Destroying threads relies on the service threads filling in + * rqstp->rq_task, which only the nfs ones do. Assumes the serv + * has been created using svc_create_pooled(). + * + * Based on code that used to be in nfsd_svc() but tweaked + * to be pool-aware. 
+ */ +int +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *victim; + int error = 0; + unsigned int state = serv->sv_nrthreads-1; + + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + /* create new threads */ + while (nrservs > 0) { + nrservs--; + __module_get(serv->sv_module); + error = __svc_create_thread(serv->sv_function, serv, + choose_pool(serv, pool, &state)); + if (error < 0) { + module_put(serv->sv_module); + break; + } + } + /* destroy old threads */ + while (nrservs < 0 && + (victim = choose_victim(serv, pool, &state)) != NULL) { + send_sig(serv->sv_kill_signal, victim, 1); + nrservs++; + } + + return error; +} + +/* + * Called from a server thread as it's exiting. Caller must hold BKL. */ void svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; svc_release_buffer(rqstp); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); + + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads--; + list_del(&rqstp->rq_all); + spin_unlock_bh(&pool->sp_lock); + kfree(rqstp); /* Release the server */ @@ -215,23 +644,32 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) unsigned long flags; int i, error = 0, dummy; - progp = serv->sv_program; - - dprintk("RPC: svc_register(%s, %s, %d)\n", - progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port); - if (!port) clear_thread_flag(TIF_SIGPENDING); - for (i = 0; i < progp->pg_nvers; i++) { - if (progp->pg_vers[i] == NULL) - continue; - error = rpc_register(progp->pg_prog, i, proto, port, &dummy); - if (error < 0) - break; - if (port && !dummy) { - error = -EACCES; - break; + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + + dprintk("RPC: svc_register(%s, %s, %d, %d)%s\n", + progp->pg_name, + proto == IPPROTO_UDP? "udp" : "tcp", + port, + i, + progp->pg_vers[i]->vs_hidden? + " (but not telling portmap)" : ""); + + if (progp->pg_vers[i]->vs_hidden) + continue; + + error = rpc_register(progp->pg_prog, i, proto, port, &dummy); + if (error < 0) + break; + if (port && !dummy) { + error = -EACCES; + break; + } } } @@ -248,19 +686,20 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) * Process the RPC request. */ int -svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_process(struct svc_rqst *rqstp) { struct svc_program *progp; struct svc_version *versp = NULL; /* compiler food */ struct svc_procedure *procp = NULL; struct kvec * argv = &rqstp->rq_arg.head[0]; struct kvec * resv = &rqstp->rq_res.head[0]; + struct svc_serv *serv = rqstp->rq_server; kxdrproc_t xdr; __be32 *statp; u32 dir, prog, vers, proc; __be32 auth_stat, rpc_stat; int auth_res; - __be32 *accept_statp; + __be32 *reply_statp; rpc_stat = rpc_success; @@ -270,10 +709,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) /* setup response xdr_buf. 
* Initially it has just one page */ - svc_take_page(rqstp); /* must succeed */ + rqstp->rq_resused = 1; resv->iov_base = page_address(rqstp->rq_respages[0]); resv->iov_len = 0; - rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.pages = rqstp->rq_respages + 1; rqstp->rq_res.len = 0; rqstp->rq_res.page_base = 0; rqstp->rq_res.page_len = 0; @@ -301,7 +740,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) goto err_bad_rpc; /* Save position in case we later decide to reject: */ - accept_statp = resv->iov_base + resv->iov_len; + reply_statp = resv->iov_base + resv->iov_len; svc_putnl(resv, 0); /* ACCEPT */ @@ -449,7 +888,7 @@ err_bad_auth: dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of accept status: */ - xdr_ressize_check(rqstp, accept_statp); + xdr_ressize_check(rqstp, reply_statp); svc_putnl(resv, 1); /* REJECT */ svc_putnl(resv, 1); /* AUTH_ERROR */ svc_putnl(resv, ntohl(auth_stat)); /* status */ @@ -489,3 +928,18 @@ err_bad: svc_putnl(resv, ntohl(rpc_stat)); goto sendit; } + +/* + * Return (transport-specific) limit on the rpc payload. + */ +u32 svc_max_payload(const struct svc_rqst *rqstp) +{ + int max = RPCSVC_MAXPAYLOAD_TCP; + + if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) + max = RPCSVC_MAXPAYLOAD_UDP; + if (rqstp->rq_server->sv_bufsz < max) + max = rqstp->rq_server->sv_bufsz; + return max; +} +EXPORT_SYMBOL_GPL(svc_max_payload); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 1020d54..e1bd933 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/hash.h> #include <linux/string.h> +#include <net/sock.h> #define RPCDBG_FACILITY RPCDBG_AUTH @@ -348,12 +349,9 @@ int auth_unix_forget_old(struct auth_domain *dom) struct auth_domain *auth_unix_lookup(struct in_addr addr) { - struct ip_map key, *ipm; + struct ip_map *ipm; struct auth_domain *rv; - strcpy(key.m_class, "nfsd"); - key.m_addr = addr; - ipm = ip_map_lookup("nfsd", addr); if (!ipm) @@ -378,6 +376,44 @@ void svcauth_unix_purge(void) cache_purge(&ip_map_cache); } +static inline struct ip_map * +ip_map_cached_get(struct svc_rqst *rqstp) +{ + struct ip_map *ipm = rqstp->rq_sock->sk_info_authunix; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. 
+ */ + rqstp->rq_sock->sk_info_authunix = NULL; + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); + } + return ipm; +} + +static inline void +ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) +{ + struct svc_sock *svsk = rqstp->rq_sock; + + if (svsk->sk_sock->type == SOCK_STREAM && svsk->sk_info_authunix == NULL) + svsk->sk_info_authunix = ipm; /* newly cached, keep the reference */ + else + cache_put(&ipm->h, &ip_map_cache); +} + +void +svcauth_unix_info_release(void *info) +{ + struct ip_map *ipm = info; + cache_put(&ipm->h, &ip_map_cache); +} + static int svcauth_unix_set_client(struct svc_rqst *rqstp) { @@ -387,8 +423,10 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) if (rqstp->rq_proc == 0) return SVC_OK; - ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, - rqstp->rq_addr.sin_addr); + ipm = ip_map_cached_get(rqstp); + if (ipm == NULL) + ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, + rqstp->rq_addr.sin_addr); if (ipm == NULL) return SVC_DENIED; @@ -403,7 +441,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) case 0: rqstp->rq_client = &ipm->m_client->h; kref_get(&rqstp->rq_client->ref); - cache_put(&ipm->h, &ip_map_cache); + ip_map_cached_put(rqstp, ipm); break; } return SVC_OK; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5b0fe1b..b39e7e2 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/file.h> #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -45,13 +46,16 @@ /* SMP locking strategy: * - * svc_serv->sv_lock protects most stuff for that service. + * svc_pool->sp_lock protects most of the fields of that pool. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * when both need to be taken (rare), svc_serv->sv_lock is first. + * BKL protects svc_serv->sv_nrthread. + * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list + * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_BUSY can be set to 0 at any time. - * svc_sock_enqueue must be called afterwards * SK_CONN, SK_DATA, can be set or cleared at any time. * after a set, svc_sock_enqueue must be called. * after a clear, the socket must be read/accepted @@ -73,23 +77,30 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + /* - * Queue up an idle server thread. Must have serv->sv_lock held. + * Queue up an idle server thread. Must have pool->sp_lock held. * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't polute + * use as many different threads as we need, and the rest don't pollute * the cache. */ static inline void -svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) { - list_add(&rqstp->rq_list, &serv->sv_threads); + list_add(&rqstp->rq_list, &pool->sp_threads); } /* - * Dequeue an nfsd thread. Must have serv->sv_lock held. 
+ * Dequeue an nfsd thread. Must have pool->sp_lock held. */ static inline void -svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) { list_del(&rqstp->rq_list); } @@ -140,7 +151,9 @@ static void svc_sock_enqueue(struct svc_sock *svsk) { struct svc_serv *serv = svsk->sk_server; + struct svc_pool *pool; struct svc_rqst *rqstp; + int cpu; if (!(svsk->sk_flags & ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) @@ -148,10 +161,14 @@ svc_sock_enqueue(struct svc_sock *svsk) if (test_bit(SK_DEAD, &svsk->sk_flags)) return; - spin_lock_bh(&serv->sv_lock); + cpu = get_cpu(); + pool = svc_pool_for_cpu(svsk->sk_server, cpu); + put_cpu(); + + spin_lock_bh(&pool->sp_lock); - if (!list_empty(&serv->sv_threads) && - !list_empty(&serv->sv_sockets)) + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) printk(KERN_ERR "svc_sock_enqueue: threads and sockets both waiting??\n"); @@ -161,73 +178,79 @@ svc_sock_enqueue(struct svc_sock *svsk) goto out_unlock; } - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Don't enqueue socket while daemon is receiving */ + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list. We update SK_BUSY atomically because + * it also guards against trying to enqueue the svc_sock twice. + */ + if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while already enqueued */ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); goto out_unlock; } + BUG_ON(svsk->sk_pool != NULL); + svsk->sk_pool = pool; set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((svsk->sk_reserved + serv->sv_bufsz)*2 + if (((atomic_read(&svsk->sk_reserved) + serv->sv_bufsz)*2 > svc_sock_wspace(svsk)) && !test_bit(SK_CLOSE, &svsk->sk_flags) && !test_bit(SK_CONN, &svsk->sk_flags)) { /* Don't enqueue while not enough space for reply */ dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_bufsz, svc_sock_wspace(svsk)); + svsk->sk_pool = NULL; + clear_bit(SK_BUSY, &svsk->sk_flags); goto out_unlock; } clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. - */ - set_bit(SK_BUSY, &svsk->sk_flags); - if (!list_empty(&serv->sv_threads)) { - rqstp = list_entry(serv->sv_threads.next, + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); dprintk("svc: socket %p served by daemon %p\n", svsk->sk_sk, rqstp); - svc_serv_dequeue(serv, rqstp); + svc_thread_dequeue(pool, rqstp); if (rqstp->rq_sock) printk(KERN_ERR "svc_sock_enqueue: server %p, rq_sock=%p!\n", rqstp, rqstp->rq_sock); rqstp->rq_sock = svsk; - svsk->sk_inuse++; + atomic_inc(&svsk->sk_inuse); rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); + BUG_ON(svsk->sk_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + list_add_tail(&svsk->sk_ready, &pool->sp_sockets); + BUG_ON(svsk->sk_pool != pool); } out_unlock: - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); } /* - * Dequeue the first socket. 
Must be called with the serv->sv_lock held. + * Dequeue the first socket. Must be called with the pool->sp_lock held. */ static inline struct svc_sock * -svc_sock_dequeue(struct svc_serv *serv) +svc_sock_dequeue(struct svc_pool *pool) { struct svc_sock *svsk; - if (list_empty(&serv->sv_sockets)) + if (list_empty(&pool->sp_sockets)) return NULL; - svsk = list_entry(serv->sv_sockets.next, + svsk = list_entry(pool->sp_sockets.next, struct svc_sock, sk_ready); list_del_init(&svsk->sk_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, svsk->sk_inuse); + svsk->sk_sk, atomic_read(&svsk->sk_inuse)); return svsk; } @@ -241,6 +264,7 @@ svc_sock_dequeue(struct svc_serv *serv) static inline void svc_sock_received(struct svc_sock *svsk) { + svsk->sk_pool = NULL; clear_bit(SK_BUSY, &svsk->sk_flags); svc_sock_enqueue(svsk); } @@ -262,10 +286,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space) if (space < rqstp->rq_reserved) { struct svc_sock *svsk = rqstp->rq_sock; - spin_lock_bh(&svsk->sk_server->sv_lock); - svsk->sk_reserved -= (rqstp->rq_reserved - space); + atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); rqstp->rq_reserved = space; - spin_unlock_bh(&svsk->sk_server->sv_lock); svc_sock_enqueue(svsk); } @@ -277,17 +299,11 @@ void svc_reserve(struct svc_rqst *rqstp, int space) static inline void svc_sock_put(struct svc_sock *svsk) { - struct svc_serv *serv = svsk->sk_server; - - spin_lock_bh(&serv->sv_lock); - if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { - spin_unlock_bh(&serv->sv_lock); + if (atomic_dec_and_test(&svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { dprintk("svc: releasing dead socket\n"); sock_release(svsk->sk_sock); kfree(svsk); } - else - spin_unlock_bh(&serv->sv_lock); } static void @@ -297,7 +313,7 @@ svc_sock_release(struct svc_rqst *rqstp) svc_release_skb(rqstp); - svc_free_allpages(rqstp); + svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -321,25 +337,33 @@ svc_sock_release(struct svc_rqst *rqstp) /* * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. 
*/ void svc_wake_up(struct svc_serv *serv) { struct svc_rqst *rqstp; - - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_threads)) { - rqstp = list_entry(serv->sv_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_serv_dequeue(serv, rqstp); - rqstp->rq_sock = NULL; - */ - wake_up(&rqstp->rq_wait); + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); } - spin_unlock_bh(&serv->sv_lock); } /* @@ -388,7 +412,8 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) /* send head */ if (slen == xdr->head[0].iov_len) flags = 0; - len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags); + len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, + xdr->head[0].iov_len, flags); if (len != xdr->head[0].iov_len) goto out; slen -= xdr->head[0].iov_len; @@ -413,8 +438,9 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) } /* send tail */ if (xdr->tail[0].iov_len) { - result = kernel_sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], - ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + result = kernel_sendpage(sock, rqstp->rq_respages[0], + ((unsigned long)xdr->tail[0].iov_base) + & (PAGE_SIZE-1), xdr->tail[0].iov_len, 0); if (result > 0) @@ -429,6 +455,56 @@ out: } /* + * Report socket names for nfsdfs + */ +static int one_sock_name(char *buf, struct svc_sock *svsk) +{ + int len; + + switch(svsk->sk_sk->sk_family) { + case AF_INET: + len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n", + svsk->sk_sk->sk_protocol==IPPROTO_UDP? + "udp" : "tcp", + NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr), + inet_sk(svsk->sk_sk)->num); + break; + default: + len = sprintf(buf, "*unknown-%d*\n", + svsk->sk_sk->sk_family); + } + return len; +} + +int +svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) +{ + struct svc_sock *svsk, *closesk = NULL; + int len = 0; + + if (!serv) + return 0; + spin_lock(&serv->sv_lock); + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + int onelen = one_sock_name(buf+len, svsk); + if (toclose && strcmp(toclose, buf+len) == 0) + closesk = svsk; + else + len += onelen; + } + spin_unlock(&serv->sv_lock); + if (closesk) + /* Should unregister with portmap, but you cannot + * unregister just one protocol... + */ + svc_delete_socket(closesk); + else if (toclose) + return -ENOENT; + return len; +} +EXPORT_SYMBOL(svc_sock_names); + +/* * Check input queue length */ static int @@ -557,7 +633,10 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. sndbuf must * also be large enough that there is enough space - * for one reply per thread. + * for one reply per thread. We count all threads + * rather than threads in a particular pool, which + * provides an upper bound on the number of threads + * which will access the socket. 
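
To make the buffer-sizing rule above concrete, a small worked example; the thread count and buffer size below are made up, only the (nrthreads + 3) * bufsz scaling is taken from the code:

	/* Hypothetical numbers; only the formula comes from the UDP receive path. */
	static unsigned int example_udp_sockbuf(void)
	{
		unsigned int nrthreads = 8;		/* serv->sv_nrthreads */
		unsigned int bufsz     = 32 * 1024;	/* serv->sv_bufsz     */

		return (nrthreads + 3) * bufsz;		/* 11 * 32768 = 360448 bytes */
	}
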
*/ svc_sock_setbufsize(svsk->sk_sock, (serv->sv_nrthreads+3) * serv->sv_bufsz, @@ -631,9 +710,11 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = len; rqstp->rq_arg.page_len = 0; + rqstp->rq_respages = rqstp->rq_pages+1; } else { rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; + rqstp->rq_respages = rqstp->rq_pages + 1 + + (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; } if (serv->sv_stats) @@ -844,7 +925,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_sock, sk_list); set_bit(SK_CLOSE, &svsk->sk_flags); - svsk->sk_inuse ++; + atomic_inc(&svsk->sk_inuse); } spin_unlock_bh(&serv->sv_lock); @@ -874,7 +955,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) struct svc_sock *svsk = rqstp->rq_sock; struct svc_serv *serv = svsk->sk_server; int len; - struct kvec vec[RPCSVC_MAXPAGES]; + struct kvec *vec; int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", @@ -902,6 +983,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. + * + * We count all threads rather than threads in a + * particular pool, which provides an upper bound + * on the number of threads which will access the socket. + * * rcvbuf just needs to be able to hold a few requests. * Normally they will be removed from the queue * as soon a a complete request arrives. @@ -967,15 +1053,17 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) len = svsk->sk_reclen; set_bit(SK_DATA, &svsk->sk_flags); + vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; vlen = PAGE_SIZE; pnum = 1; while (vlen < len) { - vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]); vec[pnum].iov_len = PAGE_SIZE; pnum++; vlen += PAGE_SIZE; } + rqstp->rq_respages = &rqstp->rq_pages[pnum]; /* Now receive data */ len = svc_recvfrom(rqstp, vec, pnum, len); @@ -1117,13 +1205,17 @@ svc_sock_update_bufs(struct svc_serv *serv) } /* - * Receive the next request on any socket. + * Receive the next request on any socket. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. */ int -svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +svc_recv(struct svc_rqst *rqstp, long timeout) { struct svc_sock *svsk =NULL; - int len; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; int pages; struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); @@ -1140,27 +1232,22 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) "svc_recv: service %p, wait queue active!\n", rqstp); - /* Initialize the buffers */ - /* first reclaim pages that were moved to response list */ - svc_pushback_allpages(rqstp); /* now allocate needed pages. 
If we get a failure, sleep briefly */ pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; - while (rqstp->rq_arghi < pages) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) { - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - continue; + for (i=0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + rqstp->rq_pages[i] = p; } - rqstp->rq_argpages[rqstp->rq_arghi++] = p; - } /* Make arg->head point to first page and arg->pages point to rest */ arg = &rqstp->rq_arg; - arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); arg->head[0].iov_len = PAGE_SIZE; - rqstp->rq_argused = 1; - arg->pages = rqstp->rq_argpages + 1; + arg->pages = rqstp->rq_pages + 1; arg->page_base = 0; /* save at least one page for response */ arg->page_len = (pages-2)*PAGE_SIZE; @@ -1172,32 +1259,15 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) if (signalled()) return -EINTR; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - svsk = list_entry(serv->sv_tempsocks.next, - struct svc_sock, sk_list); - /* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ - if (get_seconds() - svsk->sk_lastrecv < 6*60 - || test_bit(SK_BUSY, &svsk->sk_flags)) - svsk = NULL; - } - if (svsk) { - set_bit(SK_BUSY, &svsk->sk_flags); - set_bit(SK_CLOSE, &svsk->sk_flags); - rqstp->rq_sock = svsk; - svsk->sk_inuse++; - } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + spin_lock_bh(&pool->sp_lock); + if ((svsk = svc_sock_dequeue(pool)) != NULL) { rqstp->rq_sock = svsk; - svsk->sk_inuse++; + atomic_inc(&svsk->sk_inuse); rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); } else { /* No data pending. Go to sleep */ - svc_serv_enqueue(serv, rqstp); + svc_thread_enqueue(pool, rqstp); /* * We have to be able to interrupt this wait @@ -1205,26 +1275,26 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) */ set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); schedule_timeout(timeout); try_to_freeze(); - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); if (!(svsk = rqstp->rq_sock)) { - svc_serv_dequeue(serv, rqstp); - spin_unlock_bh(&serv->sv_lock); + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); dprintk("svc: server %p, no data yet\n", rqstp); return signalled()? 
-EINTR : -EAGAIN; } } - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, socket %p, inuse=%d\n", - rqstp, svsk, svsk->sk_inuse); + dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", + rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); len = svsk->sk_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); @@ -1235,13 +1305,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); - if (test_bit(SK_TEMP, &svsk->sk_flags)) { - /* push active sockets to end of list */ - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&svsk->sk_list)) - list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); - spin_unlock_bh(&serv->sv_lock); - } + clear_bit(SK_OLD, &svsk->sk_flags); rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; rqstp->rq_chandle.defer = svc_defer; @@ -1301,6 +1365,58 @@ svc_send(struct svc_rqst *rqstp) } /* + * Timer function to close old temporary sockets, using + * a mark-and-sweep algorithm. + */ +static void +svc_age_temp_sockets(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_sock *svsk; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_sockets\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_sockets: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + svsk = list_entry(le, struct svc_sock, sk_list); + + if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) + continue; + if (atomic_read(&svsk->sk_inuse) || test_bit(SK_BUSY, &svsk->sk_flags)) + continue; + atomic_inc(&svsk->sk_inuse); + list_move(le, &to_be_aged); + set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(SK_DETACHED, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ + list_del_init(le); + svsk = list_entry(le, struct svc_sock, sk_list); + + dprintk("queuing svsk %p for closing, %lu seconds old\n", + svsk, get_seconds() - svsk->sk_lastrecv); + + /* a thread will dequeue and close it soon */ + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* * Initialize socket for RPC use and create svc_sock struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
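
The svc_age_temp_sockets() logic above is a classic mark-and-sweep over the temporary sockets: the first sweep marks a connection, any activity clears the mark, and a connection still marked on the next sweep is reaped. The same idea in stand-alone C, as an illustration of the technique rather than kernel code:

	#include <stdbool.h>
	#include <stddef.h>

	struct conn {
		bool old;		/* plays the role of SK_OLD   */
		bool in_use;		/* plays the role of sk_inuse */
		struct conn *next;
	};

	/* called from the receive path */
	void conn_saw_activity(struct conn *c)
	{
		c->old = false;
	}

	/* called once per aging period; returns how many connections were reaped */
	int conn_sweep(struct conn *list, void (*close_fn)(struct conn *))
	{
		int reaped = 0;

		for (struct conn *c = list; c != NULL; c = c->next) {
			if (!c->old) {		/* first sighting: just mark it */
				c->old = true;
				continue;
			}
			if (c->in_use)		/* busy: leave it for the next sweep */
				continue;
			close_fn(c);		/* idle for a whole period: reap it */
			reaped++;
		}
		return reaped;
	}
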
*/ @@ -1337,7 +1453,9 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; svsk->sk_server = serv; + atomic_set(&svsk->sk_inuse, 0); svsk->sk_lastrecv = get_seconds(); + spin_lock_init(&svsk->sk_defer_lock); INIT_LIST_HEAD(&svsk->sk_deferred); INIT_LIST_HEAD(&svsk->sk_ready); mutex_init(&svsk->sk_mutex); @@ -1353,6 +1471,13 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, set_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp sockets */ + setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } } else { clear_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_permsocks); @@ -1367,6 +1492,38 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, return svsk; } +int svc_addsock(struct svc_serv *serv, + int fd, + char *name_return, + int *proto) +{ + int err = 0; + struct socket *so = sockfd_lookup(fd, &err); + struct svc_sock *svsk = NULL; + + if (!so) + return err; + if (so->sk->sk_family != AF_INET) + err = -EAFNOSUPPORT; + else if (so->sk->sk_protocol != IPPROTO_TCP && + so->sk->sk_protocol != IPPROTO_UDP) + err = -EPROTONOSUPPORT; + else if (so->state > SS_UNCONNECTED) + err = -EISCONN; + else { + svsk = svc_setup_socket(serv, so, &err, 1); + if (svsk) + err = 0; + } + if (err) { + sockfd_put(so); + return err; + } + if (proto) *proto = so->sk->sk_protocol; + return one_sock_name(name_return, svsk); +} +EXPORT_SYMBOL_GPL(svc_addsock); + /* * Create socket for RPC service. */ @@ -1434,15 +1591,27 @@ svc_delete_socket(struct svc_sock *svsk) spin_lock_bh(&serv->sv_lock); - list_del_init(&svsk->sk_list); - list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) + list_del_init(&svsk->sk_list); + /* + * We used to delete the svc_sock from whichever list + * it's sk_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). 
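
svc_addsock() above takes a user-supplied file descriptor rather than creating the socket itself. A hypothetical in-kernel caller (not part of this patch) would use it roughly as follows, with the buffer size chosen here only as an assumption comfortably larger than what one_sock_name() prints:

	static int example_add_listener(struct svc_serv *serv, int fd)
	{
		char name[64];		/* assumed big enough for one_sock_name() output */
		int proto = 0;
		int len;

		len = svc_addsock(serv, fd, name, &proto);
		if (len < 0)
			return len;	/* -EAFNOSUPPORT, -EPROTONOSUPPORT, -EISCONN, ... */

		printk(KERN_INFO "added %s listener: %.*s",
		       proto == IPPROTO_UDP ? "udp" : "tcp", len, name);
		return 0;
	}
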
+ */ if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) if (test_bit(SK_TEMP, &svsk->sk_flags)) serv->sv_tmpcnt--; - if (!svsk->sk_inuse) { + if (!atomic_read(&svsk->sk_inuse)) { spin_unlock_bh(&serv->sv_lock); - sock_release(svsk->sk_sock); + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + if (svsk->sk_info_authunix != NULL) + svcauth_unix_info_release(svsk->sk_info_authunix); kfree(svsk); } else { spin_unlock_bh(&serv->sv_lock); @@ -1473,7 +1642,6 @@ svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) static void svc_revisit(struct cache_deferred_req *dreq, int too_many) { struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_serv *serv = dreq->owner; struct svc_sock *svsk; if (too_many) { @@ -1484,9 +1652,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) dprintk("revisit queued\n"); svsk = dr->svsk; dr->svsk = NULL; - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&svsk->sk_defer_lock); list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&svsk->sk_defer_lock); set_bit(SK_DEFERRED, &svsk->sk_flags); svc_sock_enqueue(svsk); svc_sock_put(svsk); @@ -1518,10 +1686,8 @@ svc_defer(struct cache_req *req) dr->argslen = rqstp->rq_arg.len >> 2; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); } - spin_lock_bh(&rqstp->rq_server->sv_lock); - rqstp->rq_sock->sk_inuse++; + atomic_inc(&rqstp->rq_sock->sk_inuse); dr->svsk = rqstp->rq_sock; - spin_unlock_bh(&rqstp->rq_server->sv_lock); dr->handle.revisit = svc_revisit; return &dr->handle; @@ -1541,6 +1707,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_prot = dr->prot; rqstp->rq_addr = dr->addr; rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; return dr->argslen<<2; } @@ -1548,11 +1715,10 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) { struct svc_deferred_req *dr = NULL; - struct svc_serv *serv = svsk->sk_server; if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) return NULL; - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&svsk->sk_defer_lock); clear_bit(SK_DEFERRED, &svsk->sk_flags); if (!list_empty(&svsk->sk_deferred)) { dr = list_entry(svsk->sk_deferred.next, @@ -1561,6 +1727,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) list_del_init(&dr->handle.recent); set_bit(SK_DEFERRED, &svsk->sk_flags); } - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&svsk->sk_defer_lock); return dr; } diff --git a/net/tipc/link.c b/net/tipc/link.c index 693f02e..53bc8cb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1666,8 +1666,9 @@ static void link_retransmit_failure(struct link *l_ptr, struct sk_buff *buf) char addr_string[16]; tipc_printf(TIPC_OUTPUT, "Msg seq number: %u, ", msg_seqno(msg)); - tipc_printf(TIPC_OUTPUT, "Outstanding acks: %u\n", (u32)TIPC_SKB_CB(buf)->handle); - + tipc_printf(TIPC_OUTPUT, "Outstanding acks: %lu\n", + (unsigned long) TIPC_SKB_CB(buf)->handle); + n_ptr = l_ptr->owner->next; tipc_node_lock(n_ptr); diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 6ac4e4f..d401dc8 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -41,17 +41,18 @@ static inline unsigned int __xfrm_dst_hash(xfrm_address_t *daddr, xfrm_address_t return (h ^ (h >> 16)) & hmask; } -static inline unsigned __xfrm_src_hash(xfrm_address_t *saddr, +static inline unsigned __xfrm_src_hash(xfrm_address_t 
*daddr, + xfrm_address_t *saddr, unsigned short family, unsigned int hmask) { unsigned int h = family; switch (family) { case AF_INET: - h ^= __xfrm4_addr_hash(saddr); + h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: - h ^= __xfrm6_addr_hash(saddr); + h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; }; return (h ^ (h >> 16)) & hmask; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b6e2e79..2a78616 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -778,8 +778,9 @@ void xfrm_policy_flush(u8 type) for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy *pol; struct hlist_node *entry; - int i; + int i, killed; + killed = 0; again1: hlist_for_each_entry(pol, entry, &xfrm_policy_inexact[dir], bydst) { @@ -790,6 +791,7 @@ void xfrm_policy_flush(u8 type) write_unlock_bh(&xfrm_policy_lock); xfrm_policy_kill(pol); + killed++; write_lock_bh(&xfrm_policy_lock); goto again1; @@ -807,13 +809,14 @@ void xfrm_policy_flush(u8 type) write_unlock_bh(&xfrm_policy_lock); xfrm_policy_kill(pol); + killed++; write_lock_bh(&xfrm_policy_lock); goto again2; } } - xfrm_policy_count[dir] = 0; + xfrm_policy_count[dir] -= killed; } atomic_inc(&flow_cache_genid); write_unlock_bh(&xfrm_policy_lock); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f927b73..39b8bf3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -63,10 +63,11 @@ static inline unsigned int xfrm_dst_hash(xfrm_address_t *daddr, return __xfrm_dst_hash(daddr, saddr, reqid, family, xfrm_state_hmask); } -static inline unsigned int xfrm_src_hash(xfrm_address_t *addr, +static inline unsigned int xfrm_src_hash(xfrm_address_t *daddr, + xfrm_address_t *saddr, unsigned short family) { - return __xfrm_src_hash(addr, family, xfrm_state_hmask); + return __xfrm_src_hash(daddr, saddr, family, xfrm_state_hmask); } static inline unsigned int @@ -92,7 +93,8 @@ static void xfrm_hash_transfer(struct hlist_head *list, nhashmask); hlist_add_head(&x->bydst, ndsttable+h); - h = __xfrm_src_hash(&x->props.saddr, x->props.family, + h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, + x->props.family, nhashmask); hlist_add_head(&x->bysrc, nsrctable+h); @@ -458,7 +460,7 @@ static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, __be32 spi, static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family) { - unsigned int h = xfrm_src_hash(saddr, family); + unsigned int h = xfrm_src_hash(daddr, saddr, family); struct xfrm_state *x; struct hlist_node *entry; @@ -587,7 +589,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, if (km_query(x, tmpl, pol) == 0) { x->km.state = XFRM_STATE_ACQ; hlist_add_head(&x->bydst, xfrm_state_bydst+h); - h = xfrm_src_hash(saddr, family); + h = xfrm_src_hash(daddr, saddr, family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); if (x->id.spi) { h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family); @@ -622,7 +624,7 @@ static void __xfrm_state_insert(struct xfrm_state *x) x->props.reqid, x->props.family); hlist_add_head(&x->bydst, xfrm_state_bydst+h); - h = xfrm_src_hash(&x->props.saddr, x->props.family); + h = xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); if (x->id.spi) { @@ -748,7 +750,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; add_timer(&x->timer); hlist_add_head(&x->bydst, xfrm_state_bydst+h); - h = 
xfrm_src_hash(saddr, family); + h = xfrm_src_hash(daddr, saddr, family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); wake_up(&km_waitq); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c59a78d..d54b3a7 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -211,6 +211,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, case XFRM_MODE_TRANSPORT: case XFRM_MODE_TUNNEL: case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_BEET: break; default:
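
On the xfrm_hash.h/xfrm_state.c change earlier in this diff: __xfrm_src_hash() now derives the bucket from the (daddr, saddr) pair instead of the source address alone. The real combiners (__xfrm4_daddr_saddr_hash / __xfrm6_daddr_saddr_hash) are defined elsewhere in xfrm_hash.h and are not shown here; the mixing below is a stand-in used only to illustrate the idea, not their actual definition:

	#include <stdint.h>

	/* Illustration: fold both endpoints into the hash, then apply the
	 * same final mix and mask that __xfrm_src_hash() uses. */
	static unsigned int example_v4_pair_hash(uint32_t daddr, uint32_t saddr,
						 unsigned int hmask)
	{
		unsigned int h = daddr ^ saddr;

		return (h ^ (h >> 16)) & hmask;
	}
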