aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c28
-rw-r--r--net/8021q/vlan_dev.c2
-rw-r--r--net/atm/br2684.c78
-rw-r--r--net/ax25/ax25_subr.c11
-rw-r--r--net/bluetooth/rfcomm/core.c2
-rw-r--r--net/bluetooth/rfcomm/tty.c13
-rw-r--r--net/core/dev.c46
-rw-r--r--net/core/neighbour.c9
-rw-r--r--net/core/net_namespace.c3
-rw-r--r--net/core/pktgen.c4
-rw-r--r--net/core/rtnetlink.c3
-rw-r--r--net/core/skbuff.c5
-rw-r--r--net/core/user_dma.c2
-rw-r--r--net/dccp/ackvec.c29
-rw-r--r--net/dccp/ccids/ccid3.c27
-rw-r--r--net/dccp/ccids/lib/tfrc.c8
-rw-r--r--net/dccp/ccids/lib/tfrc.h25
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c8
-rw-r--r--net/dccp/ipv4.c7
-rw-r--r--net/dccp/ipv6.c1
-rw-r--r--net/dccp/minisocks.c8
-rw-r--r--net/dccp/options.c4
-rw-r--r--net/dccp/output.c2
-rw-r--r--net/dccp/probe.c2
-rw-r--r--net/ipv4/arp.c5
-rw-r--r--net/ipv4/devinet.c9
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_semantics.c5
-rw-r--r--net/ipv4/inet_connection_sock.c11
-rw-r--r--net/ipv4/ip_gre.c146
-rw-r--r--net/ipv4/ipip.c130
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c14
-rw-r--r--net/ipv4/raw.c11
-rw-r--r--net/ipv4/route.c4
-rw-r--r--net/ipv4/syncookies.c3
-rw-r--r--net/ipv4/tcp.c27
-rw-r--r--net/ipv4/tcp_input.c80
-rw-r--r--net/ipv4/tcp_ipv4.c14
-rw-r--r--net/ipv4/tcp_minisocks.c32
-rw-r--r--net/ipv4/tcp_output.c12
-rw-r--r--net/ipv4/tcp_timer.c5
-rw-r--r--net/ipv4/tunnel4.c2
-rw-r--r--net/ipv4/udp.c3
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c2
-rw-r--r--net/ipv6/addrconf.c142
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/datagram.c50
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_input.c9
-rw-r--r--net/ipv6/ip6mr.c2
-rw-r--r--net/ipv6/ipv6_sockglue.c44
-rw-r--r--net/ipv6/ndisc.c8
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c8
-rw-r--r--net/ipv6/raw.c13
-rw-r--r--net/ipv6/route.c26
-rw-r--r--net/ipv6/sit.c133
-rw-r--r--net/ipv6/syncookies.c1
-rw-r--r--net/ipv6/tcp_ipv6.c1
-rw-r--r--net/ipv6/tunnel6.c2
-rw-r--r--net/ipv6/udp.c8
-rw-r--r--net/irda/af_irda.c12
-rw-r--r--net/key/af_key.c5
-rw-r--r--net/llc/llc_sap.c10
-rw-r--r--net/mac80211/cfg.c4
-rw-r--r--net/mac80211/ieee80211_i.h2
-rw-r--r--net/mac80211/main.c3
-rw-r--r--net/mac80211/mlme.c74
-rw-r--r--net/mac80211/rx.c4
-rw-r--r--net/mac80211/tx.c13
-rw-r--r--net/mac80211/util.c41
-rw-r--r--net/mac80211/wext.c28
-rw-r--r--net/mac80211/wme.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c3
-rw-r--r--net/netfilter/nf_conntrack_expect.c4
-rw-r--r--net/netfilter/nf_conntrack_extend.c9
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c22
-rw-r--r--net/netfilter/nf_log.c4
-rw-r--r--net/netfilter/xt_connlimit.c3
-rw-r--r--net/netlink/attr.c12
-rw-r--r--net/netlink/genetlink.c21
-rw-r--r--net/sched/cls_api.c2
-rw-r--r--net/sched/sch_dsmark.c6
-rw-r--r--net/sched/sch_gred.c3
-rw-r--r--net/sched/sch_hfsc.c2
-rw-r--r--net/sched/sch_htb.c23
-rw-r--r--net/sched/sch_red.c3
-rw-r--r--net/sctp/associola.c34
-rw-r--r--net/sctp/ipv6.c11
-rw-r--r--net/sctp/output.c2
-rw-r--r--net/sctp/outqueue.c120
-rw-r--r--net/sctp/protocol.c26
-rw-r--r--net/sctp/socket.c4
-rw-r--r--net/sctp/transport.c50
-rw-r--r--net/sunrpc/auth_generic.c8
-rw-r--r--net/sunrpc/svc_xprt.c23
-rw-r--r--net/sunrpc/svcauth_unix.c4
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c35
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c186
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c177
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c419
-rw-r--r--net/unix/af_unix.c79
-rw-r--r--net/wireless/nl80211.c12
-rw-r--r--net/xfrm/xfrm_algo.c4
-rw-r--r--net/xfrm/xfrm_user.c11
105 files changed, 1391 insertions, 1426 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 2a739ad..ab2225d 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -382,6 +382,18 @@ static void vlan_sync_address(struct net_device *dev,
memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN);
}
+static void vlan_transfer_features(struct net_device *dev,
+ struct net_device *vlandev)
+{
+ unsigned long old_features = vlandev->features;
+
+ vlandev->features &= ~dev->vlan_features;
+ vlandev->features |= dev->features & dev->vlan_features;
+
+ if (old_features != vlandev->features)
+ netdev_features_change(vlandev);
+}
+
static void __vlan_device_event(struct net_device *dev, unsigned long event)
{
switch (event) {
@@ -410,10 +422,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
int i, flgs;
struct net_device *vlandev;
- if (is_vlan_dev(dev)) {
+ if (is_vlan_dev(dev))
__vlan_device_event(dev, event);
- goto out;
- }
grp = __vlan_find_group(dev);
if (!grp)
@@ -450,6 +460,18 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
}
break;
+ case NETDEV_FEAT_CHANGE:
+ /* Propagate device features to underlying device */
+ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ vlandev = vlan_group_get_device(grp, i);
+ if (!vlandev)
+ continue;
+
+ vlan_transfer_features(dev, vlandev);
+ }
+
+ break;
+
case NETDEV_DOWN:
/* Put all VLANs for this dev in the down state too. */
for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index c961f08..5d055c2 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -663,6 +663,8 @@ static int vlan_dev_init(struct net_device *dev)
(1<<__LINK_STATE_DORMANT))) |
(1<<__LINK_STATE_PRESENT);
+ dev->features |= real_dev->features & real_dev->vlan_features;
+
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 9d52ebf..05fafdc 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -188,10 +188,13 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
return 0;
}
}
- } else {
- skb_push(skb, 2);
- if (brdev->payload == p_bridged)
+ } else { /* e_vc */
+ if (brdev->payload == p_bridged) {
+ skb_push(skb, 2);
memset(skb->data, 0, 2);
+ } else { /* p_routed */
+ skb_pull(skb, ETH_HLEN);
+ }
}
skb_debug(skb);
@@ -377,11 +380,8 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
(skb->data + 6, ethertype_ipv4,
sizeof(ethertype_ipv4)) == 0)
skb->protocol = __constant_htons(ETH_P_IP);
- else {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb);
- return;
- }
+ else
+ goto error;
skb_pull(skb, sizeof(llc_oui_ipv4));
skb_reset_network_header(skb);
skb->pkt_type = PACKET_HOST;
@@ -394,44 +394,56 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
(memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
skb_pull(skb, sizeof(llc_oui_pid_pad));
skb->protocol = eth_type_trans(skb, net_dev);
- } else {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb);
- return;
- }
+ } else
+ goto error;
- } else {
- /* first 2 chars should be 0 */
- if (*((u16 *) (skb->data)) != 0) {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb);
- return;
+ } else { /* e_vc */
+ if (brdev->payload == p_routed) {
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ if (iph->version == 4)
+ skb->protocol = __constant_htons(ETH_P_IP);
+ else if (iph->version == 6)
+ skb->protocol = __constant_htons(ETH_P_IPV6);
+ else
+ goto error;
+ skb->pkt_type = PACKET_HOST;
+ } else { /* p_bridged */
+ /* first 2 chars should be 0 */
+ if (*((u16 *) (skb->data)) != 0)
+ goto error;
+ skb_pull(skb, BR2684_PAD_LEN);
+ skb->protocol = eth_type_trans(skb, net_dev);
}
- skb_pull(skb, BR2684_PAD_LEN + ETH_HLEN); /* pad, dstmac, srcmac, ethtype */
- skb->protocol = eth_type_trans(skb, net_dev);
}
#ifdef CONFIG_ATM_BR2684_IPFILTER
- if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) {
- brdev->stats.rx_dropped++;
- dev_kfree_skb(skb);
- return;
- }
+ if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb)))
+ goto dropped;
#endif /* CONFIG_ATM_BR2684_IPFILTER */
skb->dev = net_dev;
ATM_SKB(skb)->vcc = atmvcc; /* needed ? */
pr_debug("received packet's protocol: %x\n", ntohs(skb->protocol));
skb_debug(skb);
- if (unlikely(!(net_dev->flags & IFF_UP))) {
- /* sigh, interface is down */
- brdev->stats.rx_dropped++;
- dev_kfree_skb(skb);
- return;
- }
+ /* sigh, interface is down? */
+ if (unlikely(!(net_dev->flags & IFF_UP)))
+ goto dropped;
brdev->stats.rx_packets++;
brdev->stats.rx_bytes += skb->len;
memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
netif_rx(skb);
+ return;
+
+dropped:
+ brdev->stats.rx_dropped++;
+ goto free_skb;
+error:
+ brdev->stats.rx_errors++;
+free_skb:
+ dev_kfree_skb(skb);
+ return;
}
/*
@@ -518,9 +530,9 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
struct sk_buff *next = skb->next;
skb->next = skb->prev = NULL;
+ br2684_push(atmvcc, skb);
BRPRIV(skb->dev)->stats.rx_bytes -= skb->len;
BRPRIV(skb->dev)->stats.rx_packets--;
- br2684_push(atmvcc, skb);
skb = next;
}
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index d8f2157..034aa10 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -64,20 +64,15 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr)
void ax25_requeue_frames(ax25_cb *ax25)
{
- struct sk_buff *skb, *skb_prev = NULL;
+ struct sk_buff *skb;
/*
* Requeue all the un-ack-ed frames on the output queue to be picked
* up by ax25_kick called from the timer. This arrangement handles the
* possibility of an empty output queue.
*/
- while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) {
- if (skb_prev == NULL)
- skb_queue_head(&ax25->write_queue, skb);
- else
- skb_append(skb_prev, skb, &ax25->write_queue);
- skb_prev = skb;
- }
+ while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL)
+ skb_queue_head(&ax25->write_queue, skb);
}
/*
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index eb62558..0c2c937 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -423,8 +423,8 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
rfcomm_dlc_lock(d);
d->state = BT_CLOSED;
- rfcomm_dlc_unlock(d);
d->state_change(d, err);
+ rfcomm_dlc_unlock(d);
skb_queue_purge(&d->tx_queue);
rfcomm_dlc_unlink(d);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index c3f749a..c919187 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -566,11 +566,22 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
if (dlc->state == BT_CLOSED) {
if (!dev->tty) {
if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
- if (rfcomm_dev_get(dev->id) == NULL)
+ /* Drop DLC lock here to avoid deadlock
+ * 1. rfcomm_dev_get will take rfcomm_dev_lock
+ * but in rfcomm_dev_add there's lock order:
+ * rfcomm_dev_lock -> dlc lock
+ * 2. rfcomm_dev_put will deadlock if it's
+ * the last reference
+ */
+ rfcomm_dlc_unlock(dlc);
+ if (rfcomm_dev_get(dev->id) == NULL) {
+ rfcomm_dlc_lock(dlc);
return;
+ }
rfcomm_dev_del(dev);
rfcomm_dev_put(dev);
+ rfcomm_dlc_lock(dlc);
}
} else
tty_hangup(dev->tty);
diff --git a/net/core/dev.c b/net/core/dev.c
index a1607bc..c421a1f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -119,6 +119,7 @@
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
#include "net-sysfs.h"
@@ -903,7 +904,11 @@ int dev_change_name(struct net_device *dev, char *newname)
strlcpy(dev->name, newname, IFNAMSIZ);
rollback:
- device_rename(&dev->dev, dev->name);
+ err = device_rename(&dev->dev, dev->name);
+ if (err) {
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ return err;
+ }
write_lock_bh(&dev_base_lock);
hlist_del(&dev->name_hlist);
@@ -1358,6 +1363,29 @@ void netif_device_attach(struct net_device *dev)
}
EXPORT_SYMBOL(netif_device_attach);
+static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+{
+ return ((features & NETIF_F_GEN_CSUM) ||
+ ((features & NETIF_F_IP_CSUM) &&
+ protocol == htons(ETH_P_IP)) ||
+ ((features & NETIF_F_IPV6_CSUM) &&
+ protocol == htons(ETH_P_IPV6)));
+}
+
+static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
+{
+ if (can_checksum_protocol(dev->features, skb->protocol))
+ return true;
+
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ if (can_checksum_protocol(dev->features & dev->vlan_features,
+ veh->h_vlan_encapsulated_proto))
+ return true;
+ }
+
+ return false;
+}
/*
* Invalidate hardware checksum when packet is to be mangled, and
@@ -1636,14 +1664,8 @@ int dev_queue_xmit(struct sk_buff *skb)
if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_set_transport_header(skb, skb->csum_start -
skb_headroom(skb));
-
- if (!(dev->features & NETIF_F_GEN_CSUM) &&
- !((dev->features & NETIF_F_IP_CSUM) &&
- skb->protocol == htons(ETH_P_IP)) &&
- !((dev->features & NETIF_F_IPV6_CSUM) &&
- skb->protocol == htons(ETH_P_IPV6)))
- if (skb_checksum_help(skb))
- goto out_kfree_skb;
+ if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
+ goto out_kfree_skb;
}
gso:
@@ -2055,6 +2077,10 @@ int netif_receive_skb(struct sk_buff *skb)
rcu_read_lock();
+ /* Don't receive packets in an exiting network namespace */
+ if (!net_alive(dev_net(skb->dev)))
+ goto out;
+
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -3137,7 +3163,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
* Load in the correct multicast list now the flags have changed.
*/
- if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
+ if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
dev->change_rx_flags(dev, IFF_MULTICAST);
dev_set_rx_mode(dev);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 5d9d713..65f01f7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1714,7 +1714,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
return nla_nest_end(skb, nest);
nla_put_failure:
- return nla_nest_cancel(skb, nest);
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
}
static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
@@ -2057,9 +2058,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
goto nla_put_failure;
}
- ci.ndm_used = now - neigh->used;
- ci.ndm_confirmed = now - neigh->confirmed;
- ci.ndm_updated = now - neigh->updated;
+ ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
+ ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+ ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 72b4c18..7c52fe2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -140,6 +140,9 @@ static void cleanup_net(struct work_struct *work)
struct pernet_operations *ops;
struct net *net;
+ /* Be very certain incoming network packets will not find us */
+ rcu_barrier();
+
net = container_of(work, struct net, work);
mutex_lock(&net_mutex);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8dca211..fdf5377 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -390,6 +390,7 @@ struct pktgen_thread {
int cpu;
wait_queue_head_t queue;
+ struct completion start_done;
};
#define REMOVE 1
@@ -3414,6 +3415,7 @@ static int pktgen_thread_worker(void *arg)
BUG_ON(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
+ complete(&t->start_done);
pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
@@ -3615,6 +3617,7 @@ static int __init pktgen_create_thread(int cpu)
INIT_LIST_HEAD(&t->if_list);
list_add_tail(&t->th_list, &pktgen_threads);
+ init_completion(&t->start_done);
p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu);
if (IS_ERR(p)) {
@@ -3639,6 +3642,7 @@ static int __init pktgen_create_thread(int cpu)
}
wake_up_process(p);
+ wait_for_completion(&t->start_done);
return 0;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cf857c4..a9a7721 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -498,7 +498,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
return nla_nest_end(skb, mx);
nla_put_failure:
- return nla_nest_cancel(skb, mx);
+ nla_nest_cancel(skb, mx);
+ return -EMSGSIZE;
}
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5c459f2..1e556d3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1445,6 +1445,7 @@ done:
if (spd.nr_pages) {
int ret;
+ struct sock *sk = __skb->sk;
/*
* Drop the socket lock, otherwise we have reverse
@@ -1455,9 +1456,9 @@ done:
* we call into ->sendpage() with the i_mutex lock held
* and networking will grab the socket lock.
*/
- release_sock(__skb->sk);
+ release_sock(sk);
ret = splice_to_pipe(pipe, &spd);
- lock_sock(__skb->sk);
+ lock_sock(sk);
return ret;
}
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 0ad1cd5..c77aff9 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -75,7 +75,7 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
end = start + skb_shinfo(skb)->frags[i].size;
copy = end - offset;
- if ((copy = end - offset) > 0) {
+ if (copy > 0) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
struct page *page = frag->page;
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 6de4bd1..1e8be24 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -290,12 +290,12 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
while (1) {
const u8 len = dccp_ackvec_len(av, index);
- const u8 state = dccp_ackvec_state(av, index);
+ const u8 av_state = dccp_ackvec_state(av, index);
/*
* valid packets not yet in av_buf have a reserved
* entry, with a len equal to 0.
*/
- if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+ if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
len == 0 && delta == 0) { /* Found our
reserved seat! */
dccp_pr_debug("Found %llu reserved seat!\n",
@@ -325,31 +325,6 @@ out_duplicate:
return -EILSEQ;
}
-#ifdef CONFIG_IP_DCCP_DEBUG
-void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
-{
- dccp_pr_debug_cat("ACK vector len=%d, ackno=%llu |", len,
- (unsigned long long)ackno);
-
- while (len--) {
- const u8 state = (*vector & DCCP_ACKVEC_STATE_MASK) >> 6;
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-
- dccp_pr_debug_cat("%d,%d|", state, rl);
- ++vector;
- }
-
- dccp_pr_debug_cat("\n");
-}
-
-void dccp_ackvec_print(const struct dccp_ackvec *av)
-{
- dccp_ackvector_print(av->av_buf_ackno,
- av->av_buf + av->av_buf_head,
- av->av_vec_len);
-}
-#endif
-
static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
struct dccp_ackvec_record *avr)
{
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index cd61dea..a1929f3 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -159,8 +159,8 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
- (s64)hctx->ccid3hctx_rtt >= 0) {
- hctx->ccid3hctx_x =
- max(min(2 * hctx->ccid3hctx_x, min_rate),
+ hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
+ hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
hctx->ccid3hctx_rtt));
hctx->ccid3hctx_t_ld = now;
@@ -193,22 +193,17 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
/*
* Update Window Counter using the algorithm from [RFC 4342, 8.1].
- * The algorithm is not applicable if RTT < 4 microseconds.
+ * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
*/
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
ktime_t now)
{
- u32 quarter_rtts;
-
- if (unlikely(hctx->ccid3hctx_rtt < 4)) /* avoid divide-by-zero */
- return;
-
- quarter_rtts = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
- quarter_rtts /= hctx->ccid3hctx_rtt / 4;
+ u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
+ quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
if (quarter_rtts > 0) {
hctx->ccid3hctx_t_last_win_count = now;
- hctx->ccid3hctx_last_win_count += min_t(u32, quarter_rtts, 5);
+ hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U);
hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */
}
}
@@ -334,8 +329,14 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
hctx->ccid3hctx_t_ld = now;
} else {
- /* Sender does not have RTT sample: X_pps = 1 pkt/sec */
- hctx->ccid3hctx_x = hctx->ccid3hctx_s;
+ /*
+ * Sender does not have RTT sample:
+ * - set fallback RTT (RFC 4340, 3.4) since a RTT value
+ * is needed in several parts (e.g. window counter);
+ * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
+ */
+ hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
+ hctx->ccid3hctx_x = hctx->ccid3hctx_s;
hctx->ccid3hctx_x <<= 6;
}
ccid3_update_send_interval(hctx);
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
index d1dfbb8..97ecec0 100644
--- a/net/dccp/ccids/lib/tfrc.c
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -14,14 +14,6 @@ module_param(tfrc_debug, bool, 0444);
MODULE_PARM_DESC(tfrc_debug, "Enable debug messages");
#endif
-extern int tfrc_tx_packet_history_init(void);
-extern void tfrc_tx_packet_history_exit(void);
-extern int tfrc_rx_packet_history_init(void);
-extern void tfrc_rx_packet_history_exit(void);
-
-extern int tfrc_li_init(void);
-extern void tfrc_li_exit(void);
-
static int __init tfrc_module_init(void)
{
int rc = tfrc_li_init();
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 1fb1187..ed98575 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -15,7 +15,7 @@
* (at your option) any later version.
*/
#include <linux/types.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
#include "../../dccp.h"
/* internal includes that this module exports: */
#include "loss_interval.h"
@@ -29,21 +29,19 @@ extern int tfrc_debug;
#endif
/* integer-arithmetic divisions of type (a * 1000000)/b */
-static inline u64 scaled_div(u64 a, u32 b)
+static inline u64 scaled_div(u64 a, u64 b)
{
BUG_ON(b==0);
- a *= 1000000;
- do_div(a, b);
- return a;
+ return div64_u64(a * 1000000, b);
}
-static inline u32 scaled_div32(u64 a, u32 b)
+static inline u32 scaled_div32(u64 a, u64 b)
{
u64 result = scaled_div(a, b);
if (result > UINT_MAX) {
- DCCP_CRIT("Overflow: a(%llu)/b(%u) > ~0U",
- (unsigned long long)a, b);
+ DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
+ (unsigned long long)a, (unsigned long long)b);
return UINT_MAX;
}
return result;
@@ -58,7 +56,14 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}
-extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern int tfrc_tx_packet_history_init(void);
+extern void tfrc_tx_packet_history_exit(void);
+extern int tfrc_rx_packet_history_init(void);
+extern void tfrc_rx_packet_history_exit(void);
+
+extern int tfrc_li_init(void);
+extern void tfrc_li_exit(void);
#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index e4e64b7..2f20a29 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -661,7 +661,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
EXPORT_SYMBOL_GPL(tfrc_calc_x);
-/*
+/**
* tfrc_calc_x_reverse_lookup - try to find p given f(p)
*
* @fvalue: function value to match, scaled by 1000000
@@ -676,11 +676,11 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
/* Error cases. */
if (fvalue < tfrc_calc_x_lookup[0][1]) {
- DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
- return tfrc_calc_x_lookup[0][1];
+ DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
+ return TFRC_SMALLEST_P;
}
if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
- DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
+ DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
return 1000000;
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b348dd7..37d27bc 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -589,7 +589,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = reqsk_alloc(&dccp_request_sock_ops);
+ req = inet_reqsk_alloc(&dccp_request_sock_ops);
if (req == NULL)
goto drop;
@@ -605,7 +605,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
ireq = inet_rsk(req);
ireq->loc_addr = ip_hdr(skb)->daddr;
ireq->rmt_addr = ip_hdr(skb)->saddr;
- ireq->opt = NULL;
/*
* Step 3: Process LISTEN state
@@ -739,8 +738,8 @@ int dccp_invalid_packet(struct sk_buff *skb)
* If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
* has short sequence numbers), drop packet and return
*/
- if (dh->dccph_type >= DCCP_PKT_DATA &&
- dh->dccph_type <= DCCP_PKT_DATAACK && dh->dccph_x == 0) {
+ if ((dh->dccph_type < DCCP_PKT_DATA ||
+ dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
dccp_packet_name(dh->dccph_type));
return 1;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9b1129b..f7fe2a5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -421,7 +421,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
ireq6 = inet6_rsk(req);
ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
- ireq6->pktopts = NULL;
if (ipv6_opt_accepted(sk, skb) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 33ad483..66dca5b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -165,12 +165,12 @@ out_free:
/* See dccp_v4_conn_request */
newdmsk->dccpms_sequence_window = req->rcv_wnd;
- newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
- dccp_update_gsr(newsk, dreq->dreq_isr);
-
- newdp->dccps_iss = dreq->dreq_iss;
+ newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
dccp_update_gss(newsk, dreq->dreq_iss);
+ newdp->dccps_isr = dreq->dreq_isr;
+ dccp_update_gsr(newsk, dreq->dreq_isr);
+
/*
* SWL and AWL are initially adjusted so that they are not less than
* the initial Sequence Numbers received and sent, respectively:
diff --git a/net/dccp/options.c b/net/dccp/options.c
index d2a84a2..43bc24e 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -107,9 +107,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
*
* CCID-specific options are ignored during connection setup, as
* negotiation may still be in progress (see RFC 4340, 10.3).
+ * The same applies to Ack Vectors, as these depend on the CCID.
*
*/
- if (dreq != NULL && opt >= 128)
+ if (dreq != NULL && (opt >= 128 ||
+ opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
goto ignore_option;
switch (opt) {
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 1f8a9b6..fe20068 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -508,6 +508,7 @@ void dccp_send_ack(struct sock *sk)
EXPORT_SYMBOL_GPL(dccp_send_ack);
+#if 0
/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
void dccp_send_delayed_ack(struct sock *sk)
{
@@ -538,6 +539,7 @@ void dccp_send_delayed_ack(struct sock *sk)
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
+#endif
void dccp_send_sync(struct sock *sk, const u64 ackno,
const enum dccp_pkt_type pkt_type)
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 0bcdc92..81368a7 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -42,7 +42,7 @@ static int bufsize = 64 * 1024;
static const char procname[] = "dccpprobe";
-struct {
+static struct {
struct kfifo *fifo;
spinlock_t lock;
wait_queue_head_t wait;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 418862f..9b539fa 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1288,7 +1288,6 @@ static void arp_format_neigh_entry(struct seq_file *seq,
struct neighbour *n)
{
char hbuffer[HBUFFERLEN];
- const char hexbuf[] = "0123456789ABCDEF";
int k, j;
char tbuf[16];
struct net_device *dev = n->dev;
@@ -1302,8 +1301,8 @@ static void arp_format_neigh_entry(struct seq_file *seq,
else {
#endif
for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
- hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
- hbuffer[k++] = hexbuf[n->ha[j] & 15];
+ hbuffer[k++] = hex_asc_hi(n->ha[j]);
+ hbuffer[k++] = hex_asc_lo(n->ha[j]);
hbuffer[k++] = ':';
}
hbuffer[--k] = 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6848e47..79a7ef6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -90,7 +90,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .type = NLA_U32 },
[IFA_ADDRESS] = { .type = NLA_U32 },
[IFA_BROADCAST] = { .type = NLA_U32 },
- [IFA_ANYCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
};
@@ -536,9 +535,6 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
if (tb[IFA_BROADCAST])
ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
- if (tb[IFA_ANYCAST])
- ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]);
-
if (tb[IFA_LABEL])
nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
else
@@ -745,7 +741,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
break;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = 0;
- ifa->ifa_anycast = 0;
ifa->ifa_scope = 0;
}
@@ -1113,7 +1108,6 @@ static inline size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_ADDRESS */
+ nla_total_size(4) /* IFA_LOCAL */
+ nla_total_size(4) /* IFA_BROADCAST */
- + nla_total_size(4) /* IFA_ANYCAST */
+ nla_total_size(IFNAMSIZ); /* IFA_LABEL */
}
@@ -1143,9 +1137,6 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
if (ifa->ifa_broadcast)
NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
- if (ifa->ifa_anycast)
- NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast);
-
if (ifa->ifa_label[0])
NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 0f1557a..0b2ac6a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -506,7 +506,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
[RTA_PREFSRC] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
- [RTA_PROTOINFO] = { .type = NLA_U32 },
[RTA_FLOW] = { .type = NLA_U32 },
};
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3b83c34..0d4d728 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -960,7 +960,10 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtm->rtm_dst_len = dst_len;
rtm->rtm_src_len = 0;
rtm->rtm_tos = tos;
- rtm->rtm_table = tb_id;
+ if (tb_id < 256)
+ rtm->rtm_table = tb_id;
+ else
+ rtm->rtm_table = RT_TABLE_COMPAT;
NLA_PUT_U32(skb, RTA_TABLE, tb_id);
rtm->rtm_type = type;
rtm->rtm_flags = fi->fib_flags;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 828ea21..ec83448 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
struct inet_connection_sock *icsk = inet_csk(parent);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct listen_sock *lopt = queue->listen_opt;
- int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
unsigned long now = jiffies;
struct request_sock **reqp, *req;
int i, budget;
@@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
}
}
+ if (queue->rskq_defer_accept)
+ max_retries = queue->rskq_defer_accept;
+
budget = 2 * (lopt->nr_table_entries / (timeout / interval));
i = lopt->clock_hand;
@@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if (req->retrans < thresh &&
- !req->rsk_ops->rtx_syn_ack(parent, req)) {
+ if ((req->retrans < thresh ||
+ (inet_rsk(req)->acked && req->retrans < max_retries))
+ && !req->rsk_ops->rtx_syn_ack(parent, req)) {
unsigned long timeo;
if (req->retrans++ == 0)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2ada033..4342cba 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -313,9 +313,8 @@ static void ipgre_tunnel_uninit(struct net_device *dev)
static void ipgre_err(struct sk_buff *skb, u32 info)
{
-#ifndef I_WISH_WORLD_WERE_PERFECT
-/* It is not :-( All the routers (except for Linux) return only
+/* All the routers (except for Linux) return only
8 bytes of packet payload. It means, that precise relaying of
ICMP in the real Internet is absolutely infeasible.
@@ -398,149 +397,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
out:
read_unlock(&ipgre_lock);
return;
-#else
- struct iphdr *iph = (struct iphdr*)dp;
- struct iphdr *eiph;
- __be16 *p = (__be16*)(dp+(iph->ihl<<2));
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- int rel_type = 0;
- int rel_code = 0;
- __be32 rel_info = 0;
- __u32 n = 0;
- __be16 flags;
- int grehlen = (iph->ihl<<2) + 4;
- struct sk_buff *skb2;
- struct flowi fl;
- struct rtable *rt;
-
- if (p[1] != htons(ETH_P_IP))
- return;
-
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
- return;
- if (flags&GRE_CSUM)
- grehlen += 4;
- if (flags&GRE_KEY)
- grehlen += 4;
- if (flags&GRE_SEQ)
- grehlen += 4;
- }
- if (len < grehlen + sizeof(struct iphdr))
- return;
- eiph = (struct iphdr*)(dp + grehlen);
-
- switch (type) {
- default:
- return;
- case ICMP_PARAMETERPROB:
- n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
- if (n < (iph->ihl<<2))
- return;
-
- /* So... This guy found something strange INSIDE encapsulated
- packet. Well, he is fool, but what can we do ?
- */
- rel_type = ICMP_PARAMETERPROB;
- n -= grehlen;
- rel_info = htonl(n << 24);
- break;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* And it is the only really necessary thing :-) */
- n = ntohs(icmp_hdr(skb)->un.frag.mtu);
- if (n < grehlen+68)
- return;
- n -= grehlen;
- /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
- if (n > ntohs(eiph->tot_len))
- return;
- rel_info = htonl(n);
- break;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe, it is just ether pollution. --ANK
- */
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return;
- break;
- }
-
- /* Prepare fake skb to feed it to icmp_send */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- return;
- dst_release(skb2->dst);
- skb2->dst = NULL;
- skb_pull(skb2, skb->data - (u8*)eiph);
- skb_reset_network_header(skb2);
-
- /* Try to guess incoming interface */
- memset(&fl, 0, sizeof(fl));
- fl.fl4_dst = eiph->saddr;
- fl.fl4_tos = RT_TOS(eiph->tos);
- fl.proto = IPPROTO_GRE;
- if (ip_route_output_key(dev_net(skb->dev), &rt, &fl)) {
- kfree_skb(skb2);
- return;
- }
- skb2->dev = rt->u.dst.dev;
-
- /* route "incoming" packet */
- if (rt->rt_flags&RTCF_LOCAL) {
- ip_rt_put(rt);
- rt = NULL;
- fl.fl4_dst = eiph->daddr;
- fl.fl4_src = eiph->saddr;
- fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
- rt->u.dst.dev->type != ARPHRD_IPGRE) {
- ip_rt_put(rt);
- kfree_skb(skb2);
- return;
- }
- } else {
- ip_rt_put(rt);
- if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
- skb2->dst->dev->type != ARPHRD_IPGRE) {
- kfree_skb(skb2);
- return;
- }
- }
-
- /* change mtu on this route */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- if (n > dst_mtu(skb2->dst)) {
- kfree_skb(skb2);
- return;
- }
- skb2->dst->ops->update_pmtu(skb2->dst, n);
- } else if (type == ICMP_TIME_EXCEEDED) {
- struct ip_tunnel *t = netdev_priv(skb2->dev);
- if (t->parms.iph.ttl) {
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- }
- }
-
- icmp_send(skb2, rel_type, rel_code, rel_info);
- kfree_skb(skb2);
-#endif
}
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 149111f..af5cb53 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -278,9 +278,8 @@ static void ipip_tunnel_uninit(struct net_device *dev)
static int ipip_err(struct sk_buff *skb, u32 info)
{
-#ifndef I_WISH_WORLD_WERE_PERFECT
-/* It is not :-( All the routers (except for Linux) return only
+/* All the routers (except for Linux) return only
8 bytes of packet payload. It means, that precise relaying of
ICMP in the real Internet is absolutely infeasible.
*/
@@ -337,133 +336,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
out:
read_unlock(&ipip_lock);
return err;
-#else
- struct iphdr *iph = (struct iphdr*)dp;
- int hlen = iph->ihl<<2;
- struct iphdr *eiph;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- int rel_type = 0;
- int rel_code = 0;
- __be32 rel_info = 0;
- __u32 n = 0;
- struct sk_buff *skb2;
- struct flowi fl;
- struct rtable *rt;
-
- if (len < hlen + sizeof(struct iphdr))
- return 0;
- eiph = (struct iphdr*)(dp + hlen);
-
- switch (type) {
- default:
- return 0;
- case ICMP_PARAMETERPROB:
- n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
- if (n < hlen)
- return 0;
-
- /* So... This guy found something strange INSIDE encapsulated
- packet. Well, he is fool, but what can we do ?
- */
- rel_type = ICMP_PARAMETERPROB;
- rel_info = htonl((n - hlen) << 24);
- break;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return 0;
- case ICMP_FRAG_NEEDED:
- /* And it is the only really necessary thing :-) */
- n = ntohs(icmp_hdr(skb)->un.frag.mtu);
- if (n < hlen+68)
- return 0;
- n -= hlen;
- /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
- if (n > ntohs(eiph->tot_len))
- return 0;
- rel_info = htonl(n);
- break;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe, it is just ether pollution. --ANK
- */
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return 0;
- break;
- }
-
- /* Prepare fake skb to feed it to icmp_send */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- return 0;
- dst_release(skb2->dst);
- skb2->dst = NULL;
- skb_pull(skb2, skb->data - (u8*)eiph);
- skb_reset_network_header(skb2);
-
- /* Try to guess incoming interface */
- memset(&fl, 0, sizeof(fl));
- fl.fl4_daddr = eiph->saddr;
- fl.fl4_tos = RT_TOS(eiph->tos);
- fl.proto = IPPROTO_IPIP;
- if (ip_route_output_key(dev_net(skb->dev), &rt, &key)) {
- kfree_skb(skb2);
- return 0;
- }
- skb2->dev = rt->u.dst.dev;
-
- /* route "incoming" packet */
- if (rt->rt_flags&RTCF_LOCAL) {
- ip_rt_put(rt);
- rt = NULL;
- fl.fl4_daddr = eiph->daddr;
- fl.fl4_src = eiph->saddr;
- fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
- rt->u.dst.dev->type != ARPHRD_TUNNEL) {
- ip_rt_put(rt);
- kfree_skb(skb2);
- return 0;
- }
- } else {
- ip_rt_put(rt);
- if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
- skb2->dst->dev->type != ARPHRD_TUNNEL) {
- kfree_skb(skb2);
- return 0;
- }
- }
-
- /* change mtu on this route */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- if (n > dst_mtu(skb2->dst)) {
- kfree_skb(skb2);
- return 0;
- }
- skb2->dst->ops->update_pmtu(skb2->dst, n);
- } else if (type == ICMP_TIME_EXCEEDED) {
- struct ip_tunnel *t = netdev_priv(skb2->dev);
- if (t->parms.iph.ttl) {
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- }
- }
-
- icmp_send(skb2, rel_type, rel_code, rel_info);
- kfree_skb(skb2);
- return 0;
-#endif
}
static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 0457859..d2a887f 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -556,7 +556,6 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
spin_lock_bh(&nf_nat_lock);
hlist_del_rcu(&nat->bysource);
- nat->ct = NULL;
spin_unlock_bh(&nf_nat_lock);
}
@@ -570,8 +569,8 @@ static void nf_nat_move_storage(void *new, void *old)
return;
spin_lock_bh(&nf_nat_lock);
- hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
new_nat->ct = ct;
+ hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
spin_unlock_bh(&nf_nat_lock);
}
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 5daefad..7750c97 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -232,6 +232,11 @@ static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
}
}
}
+
+ /* don't trust len bigger than ctx buffer */
+ if (*len > ctx->end - ctx->pointer)
+ return 0;
+
return 1;
}
@@ -250,6 +255,10 @@ static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
if (!asn1_length_decode(ctx, &def, &len))
return 0;
+ /* primitive shall be definite, indefinite shall be constructed */
+ if (*con == ASN1_PRI && !def)
+ return 0;
+
if (def)
*eoc = ctx->pointer + len;
else
@@ -434,6 +443,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
unsigned long *optr;
size = eoc - ctx->pointer + 1;
+
+ /* first subid actually encodes first two subids */
+ if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+ return 0;
+
*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
if (*oid == NULL) {
if (net_ratelimit())
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fead049..37a1ecd 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,6 +608,14 @@ static void raw_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
+static int raw_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip_flush_pending_frames(sk);
+ release_sock(sk);
+ return 0;
+}
+
/* This gets rid of all the nasties in af_inet. -DaveM */
static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -820,6 +828,7 @@ struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
.close = raw_close,
+ .destroy = raw_destroy,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = raw_ioctl,
@@ -925,7 +934,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
srcp = inet->num;
seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
i, src, srcp, dest, destp, sp->sk_state,
atomic_read(&sp->sk_wmem_alloc),
atomic_read(&sp->sk_rmem_alloc),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 92f90ae..96be336 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -160,7 +160,7 @@ static struct dst_ops ipv4_dst_ops = {
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
- .local_out = ip_local_out,
+ .local_out = __ip_local_out,
.entry_size = sizeof(struct rtable),
.entries = ATOMIC_INIT(0),
};
@@ -1792,7 +1792,7 @@ static int __mkroute_input(struct sk_buff *skb,
if (err)
flags |= RTCF_DIRECTSRC;
- if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
+ if (out_dev == in_dev && err &&
(IN_DEV_SHARED_MEDIA(out_dev) ||
inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
flags |= RTCF_DOREDIRECT;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 73ba989..d182a2a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -285,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
cookie_check_timestamp(&tcp_opt);
ret = NULL;
- req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
if (!req)
goto out;
@@ -301,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->rmt_port = th->source;
ireq->loc_addr = ip_hdr(skb)->daddr;
ireq->rmt_addr = ip_hdr(skb)->saddr;
- ireq->opt = NULL;
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->rcv_wscale = tcp_opt.rcv_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f886531..fc54a48 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1227,7 +1227,14 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
copied += used;
offset += used;
}
- if (offset != skb->len)
+ /*
+ * If recv_actor drops the lock (e.g. TCP splice
+ * receive) the skb pointer might be invalid when
+ * getting here: tcp_collapse might have deleted it
+ * while aggregating skbs from the socket queue.
+ */
+ skb = tcp_recv_skb(sk, seq-1, &offset);
+ if (!skb || (offset+1 != skb->len))
break;
}
if (tcp_hdr(skb)->fin) {
@@ -2105,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
case TCP_DEFER_ACCEPT:
- if (val < 0) {
- err = -EINVAL;
- } else {
- if (val > MAX_TCP_ACCEPT_DEFERRED)
- val = MAX_TCP_ACCEPT_DEFERRED;
- icsk->icsk_accept_queue.rskq_defer_accept = val;
+ icsk->icsk_accept_queue.rskq_defer_accept = 0;
+ if (val > 0) {
+ /* Translate value in seconds to number of
+ * retransmits */
+ while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
+ val > ((TCP_TIMEOUT_INIT / HZ) <<
+ icsk->icsk_accept_queue.rskq_defer_accept))
+ icsk->icsk_accept_queue.rskq_defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept++;
}
break;
@@ -2292,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = icsk->icsk_accept_queue.rskq_defer_accept;
+ val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+ ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b54d9d3..cad73b7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1392,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
if (before(next_dup->start_seq, skip_to_seq)) {
skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count);
- tcp_sacktag_walk(skb, sk, NULL,
- next_dup->start_seq, next_dup->end_seq,
- 1, fack_count, reord, flag);
+ skb = tcp_sacktag_walk(skb, sk, NULL,
+ next_dup->start_seq, next_dup->end_seq,
+ 1, fack_count, reord, flag);
}
return skb;
@@ -2483,6 +2483,20 @@ static inline void tcp_complete_cwr(struct sock *sk)
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
+static void tcp_try_keep_open(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int state = TCP_CA_Open;
+
+ if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+ state = TCP_CA_Disorder;
+
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
+ tp->high_seq = tp->snd_nxt;
+ }
+}
+
static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2496,15 +2510,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
- int state = TCP_CA_Open;
-
- if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
- state = TCP_CA_Disorder;
-
- if (inet_csk(sk)->icsk_ca_state != state) {
- tcp_set_ca_state(sk, state);
- tp->high_seq = tp->snd_nxt;
- }
+ tcp_try_keep_open(sk);
tcp_moderate_cwnd(tp);
} else {
tcp_cwnd_down(sk, flag);
@@ -3310,8 +3316,11 @@ no_queue:
return 1;
old_ack:
- if (TCP_SKB_CB(skb)->sacked)
+ if (TCP_SKB_CB(skb)->sacked) {
tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+ if (icsk->icsk_ca_state == TCP_CA_Open)
+ tcp_try_keep_open(sk);
+ }
uninteresting_ack:
SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -4532,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
}
}
-static int tcp_defer_accept_check(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tp->defer_tcp_accept.request) {
- int queued_data = tp->rcv_nxt - tp->copied_seq;
- int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ?
- tcp_hdr((struct sk_buff *)
- sk->sk_receive_queue.prev)->fin : 0;
-
- if (queued_data && hasfin)
- queued_data--;
-
- if (queued_data &&
- tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
- if (sock_flag(sk, SOCK_KEEPOPEN)) {
- inet_csk_reset_keepalive_timer(sk,
- keepalive_time_when(tp));
- } else {
- inet_csk_delete_keepalive_timer(sk);
- }
-
- inet_csk_reqsk_queue_add(
- tp->defer_tcp_accept.listen_sk,
- tp->defer_tcp_accept.request,
- sk);
-
- tp->defer_tcp_accept.listen_sk->sk_data_ready(
- tp->defer_tcp_accept.listen_sk, 0);
-
- sock_put(tp->defer_tcp_accept.listen_sk);
- sock_put(sk);
- tp->defer_tcp_accept.listen_sk = NULL;
- tp->defer_tcp_accept.request = NULL;
- } else if (hasfin ||
- tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
- tcp_reset(sk);
- return -1;
- }
- }
- return 0;
-}
-
static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4935,8 +4901,6 @@ step5:
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
-
- tcp_defer_accept_check(sk);
return 0;
csum_error:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cd601a8..12695be 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -85,10 +85,6 @@
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
-/* Check TCP sequence numbers in ICMP packets. */
-#define ICMP_MIN_LENGTH 8
-
-void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
@@ -1285,7 +1281,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = reqsk_alloc(&tcp_request_sock_ops);
+ req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
@@ -1918,14 +1914,6 @@ int tcp_v4_destroy_sock(struct sock *sk)
sk->sk_sndmsg_page = NULL;
}
- if (tp->defer_tcp_accept.request) {
- reqsk_free(tp->defer_tcp_accept.request);
- sock_put(tp->defer_tcp_accept.listen_sk);
- sock_put(sk);
- tp->defer_tcp_accept.listen_sk = NULL;
- tp->defer_tcp_accept.request = NULL;
- }
-
atomic_dec(&tcp_sockets_allocated);
return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 019c8c1..8245247 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
- Both ends (listening sockets) accept the new incoming
- connection and try to talk to each other. 8-)
+ If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+ bare ACK. Otherwise, we create an established connection. Both
+ ends (listening sockets) accept the new incoming connection and try
+ to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
@@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ inet_rsk(req)->acked = 1;
+ return NULL;
+ }
+
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
- if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
- TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-
- /* the accept queue handling is done is est recv slow
- * path so lets make sure to start there
- */
- tcp_sk(child)->pred_flags = 0;
- sock_hold(sk);
- sock_hold(child);
- tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
- tcp_sk(child)->defer_tcp_accept.request = req;
-
- inet_csk_reset_keepalive_timer(child,
- inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
- } else {
- inet_csk_reqsk_queue_add(sk, req, child);
- }
-
+ inet_csk_reqsk_queue_add(sk, req, child);
return child;
listen_overflow:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index debf235..ad993ec 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1836,7 +1836,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned int cur_mss = tcp_current_mss(sk, 0);
+ unsigned int cur_mss;
int err;
/* Inconslusive MTU probe */
@@ -1858,6 +1858,11 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
return -ENOMEM;
}
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
+ cur_mss = tcp_current_mss(sk, 0);
+
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
@@ -1884,9 +1889,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
(sysctl_tcp_retrans_collapse != 0))
tcp_retrans_try_collapse(sk, skb, cur_mss);
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
- return -EHOSTUNREACH; /* Routing failure or similar. */
-
/* Some Solaris stacks overoptimize and ignore the FIN on a
* retransmit when old data is attached. So strip it off
* since it is cheap to do so and saves bytes on the network.
@@ -2129,6 +2131,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+
+ TCP_INC_STATS(TCP_MIB_OUTRSTS);
}
/* WARNING: This routine must only be called when we have already sent
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4de68cf..63ed9d6 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data)
goto death;
}
- if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
- tcp_send_active_reset(sk, GFP_ATOMIC);
- goto death;
- }
-
if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
goto out;
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index d3b709a..cb1f0e8 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -97,7 +97,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
{
struct xfrm_tunnel *handler;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
for (handler = tunnel64_handlers; handler; handler = handler->next)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db1cb7c..56fcda3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -420,7 +420,7 @@ void udp_err(struct sk_buff *skb, u32 info)
/*
* Throw away all pending data and cancel the corking. Socket is locked.
*/
-static void udp_flush_pending_frames(struct sock *sk)
+void udp_flush_pending_frames(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
@@ -430,6 +430,7 @@ static void udp_flush_pending_frames(struct sock *sk)
ip_flush_pending_frames(sk);
}
}
+EXPORT_SYMBOL(udp_flush_pending_frames);
/**
* udp4_hwcsum_outgoing - handle outgoing HW checksumming
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 584e6d7..7135279 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -52,7 +52,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
IP_ECN_clear(top_iph);
top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
- 0 : XFRM_MODE_SKB_CB(skb)->frag_off;
+ 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
ip_select_ident(top_iph, dst->child, NULL);
top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e591e09..147588f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -731,8 +731,13 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
onlink = -1;
spin_lock(&ifa->lock);
- lifetime = min_t(unsigned long,
- ifa->valid_lft, 0x7fffffffUL/HZ);
+
+ lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
+ /*
+ * Note: Because this address is
+ * not permanent, lifetime <
+ * LONG_MAX / HZ here.
+ */
if (time_before(expires,
ifa->tstamp + lifetime * HZ))
expires = ifa->tstamp + lifetime * HZ;
@@ -1722,7 +1727,6 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
__u32 valid_lft;
__u32 prefered_lft;
int addr_type;
- unsigned long rt_expires;
struct inet6_dev *in6_dev;
pinfo = (struct prefix_info *) opt;
@@ -1764,41 +1768,49 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
* 2) Configure prefixes with the auto flag set
*/
- /* Avoid arithmetic overflow. Really, we could
- save rt_expires in seconds, likely valid_lft,
- but it would require division in fib gc, that it
- not good.
- */
- if (valid_lft >= 0x7FFFFFFF/HZ)
- rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ);
- else
- rt_expires = valid_lft * HZ;
-
- /*
- * We convert this (in jiffies) to clock_t later.
- * Avoid arithmetic overflow there as well.
- * Overflow can happen only if HZ < USER_HZ.
- */
- if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ)
- rt_expires = 0x7FFFFFFF / USER_HZ;
-
if (pinfo->onlink) {
struct rt6_info *rt;
+ unsigned long rt_expires;
+
+ /* Avoid arithmetic overflow. Really, we could
+ * save rt_expires in seconds, likely valid_lft,
+ * but it would require division in fib gc, that it
+ * not good.
+ */
+ if (HZ > USER_HZ)
+ rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
+ else
+ rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);
+
+ if (addrconf_finite_timeout(rt_expires))
+ rt_expires *= HZ;
+
rt = rt6_lookup(dev_net(dev), &pinfo->prefix, NULL,
dev->ifindex, 1);
if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
- if (rt->rt6i_flags&RTF_EXPIRES) {
- if (valid_lft == 0) {
- ip6_del_rt(rt);
- rt = NULL;
- } else {
- rt->rt6i_expires = jiffies + rt_expires;
- }
+ /* Autoconf prefix route */
+ if (valid_lft == 0) {
+ ip6_del_rt(rt);
+ rt = NULL;
+ } else if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ rt->rt6i_expires = jiffies + rt_expires;
+ rt->rt6i_flags |= RTF_EXPIRES;
+ } else {
+ rt->rt6i_flags &= ~RTF_EXPIRES;
+ rt->rt6i_expires = 0;
}
} else if (valid_lft) {
+ clock_t expires = 0;
+ int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
+ if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ flags |= RTF_EXPIRES;
+ expires = jiffies_to_clock_t(rt_expires);
+ }
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
- dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
+ dev, expires, flags);
}
if (rt)
dst_release(&rt->u.dst);
@@ -2014,17 +2026,22 @@ err_exit:
* Manual configuration of address on an interface
*/
static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
- int plen, __u8 ifa_flags, __u32 prefered_lft,
+ unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
__u32 valid_lft)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
struct net_device *dev;
int scope;
- u32 flags = RTF_EXPIRES;
+ u32 flags;
+ clock_t expires;
+ unsigned long timeout;
ASSERT_RTNL();
+ if (plen > 128)
+ return -EINVAL;
+
/* check the lifetime */
if (!valid_lft || prefered_lft > valid_lft)
return -EINVAL;
@@ -2038,17 +2055,23 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
scope = ipv6_addr_scope(pfx);
- if (valid_lft == INFINITY_LIFE_TIME) {
- ifa_flags |= IFA_F_PERMANENT;
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ expires = jiffies_to_clock_t(timeout * HZ);
+ valid_lft = timeout;
+ flags = RTF_EXPIRES;
+ } else {
+ expires = 0;
flags = 0;
- } else if (valid_lft >= 0x7FFFFFFF/HZ)
- valid_lft = 0x7FFFFFFF/HZ;
+ ifa_flags |= IFA_F_PERMANENT;
+ }
- if (prefered_lft == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
- (prefered_lft != INFINITY_LIFE_TIME))
- prefered_lft = 0x7FFFFFFF/HZ;
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa_flags |= IFA_F_DEPRECATED;
+ prefered_lft = timeout;
+ }
ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags);
@@ -2060,7 +2083,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
spin_unlock_bh(&ifp->lock);
addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
- jiffies_to_clock_t(valid_lft * HZ), flags);
+ expires, flags);
/*
* Note that section 3.1 of RFC 4429 indicates
* that the Optimistic flag should not be set for
@@ -2076,12 +2099,15 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
}
static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx,
- int plen)
+ unsigned int plen)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
struct net_device *dev;
+ if (plen > 128)
+ return -EINVAL;
+
dev = __dev_get_by_index(net, ifindex);
if (!dev)
return -ENODEV;
@@ -3148,22 +3174,30 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
u32 prefered_lft, u32 valid_lft)
{
- u32 flags = RTF_EXPIRES;
+ u32 flags;
+ clock_t expires;
+ unsigned long timeout;
if (!valid_lft || (prefered_lft > valid_lft))
return -EINVAL;
- if (valid_lft == INFINITY_LIFE_TIME) {
- ifa_flags |= IFA_F_PERMANENT;
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ expires = jiffies_to_clock_t(timeout * HZ);
+ valid_lft = timeout;
+ flags = RTF_EXPIRES;
+ } else {
+ expires = 0;
flags = 0;
- } else if (valid_lft >= 0x7FFFFFFF/HZ)
- valid_lft = 0x7FFFFFFF/HZ;
+ ifa_flags |= IFA_F_PERMANENT;
+ }
- if (prefered_lft == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
- (prefered_lft != INFINITY_LIFE_TIME))
- prefered_lft = 0x7FFFFFFF/HZ;
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa_flags |= IFA_F_DEPRECATED;
+ prefered_lft = timeout;
+ }
spin_lock_bh(&ifp->lock);
ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags;
@@ -3176,7 +3210,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
ipv6_ifa_notify(0, ifp);
addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
- jiffies_to_clock_t(valid_lft * HZ), flags);
+ expires, flags);
addrconf_verify(0);
return 0;
@@ -4242,7 +4276,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev)
neigh_sysctl_register(idev->dev, idev->nd_parms, NET_IPV6,
NET_IPV6_NEIGH, "ipv6",
&ndisc_ifinfo_sysctl_change,
- NULL);
+ ndisc_ifinfo_sysctl_strategy);
__addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
idev->dev->ifindex, idev, &idev->cnf);
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3c6aafb..e84b3fd 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -191,7 +191,7 @@ lookup_protocol:
np->mcast_hops = -1;
np->mc_loop = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->ipv6only = init_net.ipv6.sysctl.bindv6only;
+ np->ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
* using v6 API for ipv4.
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 94fa6ae..0f0f94a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -496,7 +496,8 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
return 0;
}
-int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
+int datagram_send_ctl(struct net *net,
+ struct msghdr *msg, struct flowi *fl,
struct ipv6_txoptions *opt,
int *hlimit, int *tclass)
{
@@ -509,7 +510,6 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
int addr_type;
- struct net_device *dev = NULL;
if (!CMSG_OK(msg, cmsg)) {
err = -EINVAL;
@@ -522,6 +522,9 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
switch (cmsg->cmsg_type) {
case IPV6_PKTINFO:
case IPV6_2292PKTINFO:
+ {
+ struct net_device *dev = NULL;
+
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
goto exit_f;
@@ -535,32 +538,32 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
fl->oif = src_info->ipi6_ifindex;
}
- addr_type = ipv6_addr_type(&src_info->ipi6_addr);
+ addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
- if (addr_type == IPV6_ADDR_ANY)
- break;
+ if (fl->oif) {
+ dev = dev_get_by_index(net, fl->oif);
+ if (!dev)
+ return -ENODEV;
+ } else if (addr_type & IPV6_ADDR_LINKLOCAL)
+ return -EINVAL;
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (!src_info->ipi6_ifindex)
- return -EINVAL;
- else {
- dev = dev_get_by_index(&init_net, src_info->ipi6_ifindex);
- if (!dev)
- return -ENODEV;
- }
- }
- if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr,
- dev, 0)) {
- if (dev)
- dev_put(dev);
- err = -EINVAL;
- goto exit_f;
+ if (addr_type != IPV6_ADDR_ANY) {
+ int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+ if (!ipv6_chk_addr(net, &src_info->ipi6_addr,
+ strict ? dev : NULL, 0))
+ err = -EINVAL;
+ else
+ ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr);
}
+
if (dev)
dev_put(dev);
- ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr);
+ if (err)
+ goto exit_f;
+
break;
+ }
case IPV6_FLOWINFO:
if (cmsg->cmsg_len < CMSG_LEN(4)) {
@@ -702,6 +705,11 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
}
*hlimit = *(int *)CMSG_DATA(cmsg);
+ if (*hlimit < -1 || *hlimit > 0xff) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
break;
case IPV6_TCLASS:
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index eb7a940..37a4e77 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -354,7 +354,7 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval,
msg.msg_control = (void*)(fl->opt+1);
flowi.oif = 0;
- err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk, &junk);
+ err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk, &junk);
if (err)
goto done;
err = -EINVAL;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 4e5c861..17eb48b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -102,6 +102,15 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (hdr->version != 6)
goto err;
+ /*
+ * RFC4291 2.5.3
+ * A packet received on an interface with a destination address
+ * of loopback must be dropped.
+ */
+ if (!(dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_loopback(&hdr->daddr))
+ goto err;
+
skb->transport_header = skb->network_header + sizeof(*hdr);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2de3c46..1479618 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -197,7 +197,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
const char *name = vif->dev ? vif->dev->name : "none";
seq_printf(seq,
- "%2Zd %-10s %8ld %7ld %8ld %7ld %05X\n",
+ "%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
vif - vif6_table,
name, vif->bytes_in, vif->pkt_in,
vif->bytes_out, vif->pkt_out,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 56d55fe..86e28a7 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -67,7 +67,7 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
/* RA packet may be delivered ONLY to IPPROTO_RAW socket */
if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW)
- return -EINVAL;
+ return -ENOPROTOOPT;
new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
@@ -161,9 +161,17 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
struct ipv6_txoptions *opt;
struct sk_buff *pktopt;
- if (sk->sk_protocol != IPPROTO_UDP &&
- sk->sk_protocol != IPPROTO_UDPLITE &&
- sk->sk_protocol != IPPROTO_TCP)
+ if (sk->sk_type == SOCK_RAW)
+ break;
+
+ if (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_UDPLITE) {
+ struct udp_sock *up = udp_sk(sk);
+ if (up->pending == AF_INET6) {
+ retv = -EBUSY;
+ break;
+ }
+ } else if (sk->sk_protocol != IPPROTO_TCP)
break;
if (sk->sk_state != TCP_ESTABLISHED) {
@@ -337,18 +345,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
case IPV6_DSTOPTS:
{
struct ipv6_txoptions *opt;
+
+ /* remove any sticky options header with a zero option
+ * length, per RFC3542.
+ */
if (optlen == 0)
optval = NULL;
+ else if (optlen < sizeof(struct ipv6_opt_hdr) ||
+ optlen & 0x7 || optlen > 8 * 255)
+ goto e_inval;
/* hop-by-hop / destination options are privileged option */
retv = -EPERM;
if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW))
break;
- if (optlen < sizeof(struct ipv6_opt_hdr) ||
- optlen & 0x7 || optlen > 8 * 255)
- goto e_inval;
-
opt = ipv6_renew_options(sk, np->opt, optname,
(struct ipv6_opt_hdr __user *)optval,
optlen);
@@ -416,7 +427,7 @@ sticky_done:
msg.msg_controllen = optlen;
msg.msg_control = (void*)(opt+1);
- retv = datagram_send_ctl(&msg, &fl, opt, &junk, &junk);
+ retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk);
if (retv)
goto done;
update:
@@ -438,7 +449,7 @@ done:
case IPV6_MULTICAST_HOPS:
if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
+ break;
if (optlen < sizeof(int))
goto e_inval;
if (val > 255 || val < -1)
@@ -450,13 +461,15 @@ done:
case IPV6_MULTICAST_LOOP:
if (optlen < sizeof(int))
goto e_inval;
+ if (val != valbool)
+ goto e_inval;
np->mc_loop = valbool;
retv = 0;
break;
case IPV6_MULTICAST_IF:
if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
+ break;
if (optlen < sizeof(int))
goto e_inval;
@@ -832,7 +845,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
len = min_t(unsigned int, len, ipv6_optlen(hdr));
if (copy_to_user(optval, hdr, len))
return -EFAULT;
- return ipv6_optlen(hdr);
+ return len;
}
static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
@@ -852,7 +865,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_protocol != IPPROTO_UDP &&
sk->sk_protocol != IPPROTO_UDPLITE &&
sk->sk_protocol != IPPROTO_TCP)
- return -EINVAL;
+ return -ENOPROTOOPT;
if (sk->sk_state != TCP_ESTABLISHED)
return -ENOTCONN;
val = sk->sk_family;
@@ -866,6 +879,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))
return -EFAULT;
+ if (gsf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
lock_sock(sk);
err = ip6_mc_msfget(sk, &gsf,
(struct group_filter __user *)optval, optlen);
@@ -975,6 +990,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
len = ipv6_getsockopt_sticky(sk, np->opt,
optname, optval, len);
release_sock(sk);
+ /* check if ipv6_getsockopt_sticky() returns err code */
+ if (len < 0)
+ return len;
return put_user(len, optlen);
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a55fc05..282fdb3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1727,10 +1727,10 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
return ret;
}
-static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
- int nlen, void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen)
+int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
+ int nlen, void __user *oldval,
+ size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
{
struct net_device *dev = ctl->extra1;
struct inet6_dev *idev;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 2dccad4..e65e26e 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -209,7 +209,9 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst)
arg.dst = dst;
hash = ip6qhashfn(id, src, dst);
+ local_bh_disable();
q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
+ local_bh_enable();
if (q == NULL)
goto oom;
@@ -638,10 +640,10 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
goto ret_orig;
}
- spin_lock(&fq->q.lock);
+ spin_lock_bh(&fq->q.lock);
if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
- spin_unlock(&fq->q.lock);
+ spin_unlock_bh(&fq->q.lock);
pr_debug("Can't insert skb to queue\n");
fq_put(fq);
goto ret_orig;
@@ -653,7 +655,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
if (ret_skb == NULL)
pr_debug("Can't reassemble fragmented packets\n");
}
- spin_unlock(&fq->q.lock);
+ spin_unlock_bh(&fq->q.lock);
fq_put(fq);
return ret_skb;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 232e0dc..3aee123 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -813,7 +813,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
- err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
+ err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -1164,6 +1164,15 @@ static void rawv6_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
+static int raw6_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip6_flush_pending_frames(sk);
+ release_sock(sk);
+
+ return inet6_destroy_sock(sk);
+}
+
static int rawv6_init_sk(struct sock *sk)
{
struct raw6_sock *rp = raw6_sk(sk);
@@ -1187,11 +1196,11 @@ struct proto rawv6_prot = {
.name = "RAWv6",
.owner = THIS_MODULE,
.close = rawv6_close,
+ .destroy = raw6_destroy,
.connect = ip6_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = rawv6_ioctl,
.init = rawv6_init_sk,
- .destroy = inet6_destroy_sock,
.setsockopt = rawv6_setsockopt,
.getsockopt = rawv6_getsockopt,
.sendmsg = rawv6_sendmsg,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 12bba08..d1f3e19 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,7 @@ static struct dst_ops ip6_dst_ops_template = {
.negative_advice = ip6_negative_advice,
.link_failure = ip6_link_failure,
.update_pmtu = ip6_rt_update_pmtu,
- .local_out = ip6_local_out,
+ .local_out = __ip6_local_out,
.entry_size = sizeof(struct rt6_info),
.entries = ATOMIC_INIT(0),
};
@@ -446,7 +446,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
struct route_info *rinfo = (struct route_info *) opt;
struct in6_addr prefix_buf, *prefix;
unsigned int pref;
- u32 lifetime;
+ unsigned long lifetime;
struct rt6_info *rt;
if (len < sizeof(struct route_info)) {
@@ -472,13 +472,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
if (pref == ICMPV6_ROUTER_PREF_INVALID)
pref = ICMPV6_ROUTER_PREF_MEDIUM;
- lifetime = ntohl(rinfo->lifetime);
- if (lifetime == 0xffffffff) {
- /* infinity */
- } else if (lifetime > 0x7fffffff/HZ) {
- /* Avoid arithmetic overflow */
- lifetime = 0x7fffffff/HZ - 1;
- }
+ lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
if (rinfo->length == 3)
prefix = (struct in6_addr *)rinfo->prefix;
@@ -506,7 +500,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
(rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
if (rt) {
- if (lifetime == 0xffffffff) {
+ if (!addrconf_finite_timeout(lifetime)) {
rt->rt6i_flags &= ~RTF_EXPIRES;
} else {
rt->rt6i_expires = jiffies + HZ * lifetime;
@@ -1106,7 +1100,9 @@ int ip6_route_add(struct fib6_config *cfg)
}
rt->u.dst.obsolete = -1;
- rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
+ rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
+ jiffies + clock_t_to_jiffies(cfg->fc_expires) :
+ 0;
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
@@ -2200,7 +2196,13 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
- expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
+ if (!(rt->rt6i_flags & RTF_EXPIRES))
+ expires = 0;
+ else if (rt->rt6i_expires - jiffies < INT_MAX)
+ expires = rt->rt6i_expires - jiffies;
+ else
+ expires = INT_MAX;
+
if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
expires, rt->u.dst.error) < 0)
goto nla_put_failure;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 5a6fab9..32e871a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -222,15 +222,18 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
}
-static int ipip6_tunnel_get_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
+static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
+ struct ip_tunnel_prl __user *a)
{
- struct ip_tunnel_prl *kp;
+ struct ip_tunnel_prl kprl, *kp;
struct ip_tunnel_prl_entry *prl;
unsigned int cmax, c = 0, ca, len;
int ret = 0;
- cmax = a->datalen / sizeof(*a);
- if (cmax > 1 && a->addr != htonl(INADDR_ANY))
+ if (copy_from_user(&kprl, a, sizeof(kprl)))
+ return -EFAULT;
+ cmax = kprl.datalen / sizeof(kprl);
+ if (cmax > 1 && kprl.addr != htonl(INADDR_ANY))
cmax = 1;
/* For simple GET or for root users,
@@ -261,26 +264,25 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
for (prl = t->prl; prl; prl = prl->next) {
if (c > cmax)
break;
- if (a->addr != htonl(INADDR_ANY) && prl->addr != a->addr)
+ if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr)
continue;
kp[c].addr = prl->addr;
kp[c].flags = prl->flags;
c++;
- if (a->addr != htonl(INADDR_ANY))
+ if (kprl.addr != htonl(INADDR_ANY))
break;
}
out:
read_unlock(&ipip6_lock);
len = sizeof(*kp) * c;
- ret = len ? copy_to_user(a->data, kp, len) : 0;
+ ret = 0;
+ if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen))
+ ret = -EFAULT;
kfree(kp);
- if (ret)
- return -EFAULT;
- a->datalen = len;
- return 0;
+ return ret;
}
static int
@@ -403,9 +405,8 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
static int ipip6_err(struct sk_buff *skb, u32 info)
{
-#ifndef I_WISH_WORLD_WERE_PERFECT
-/* It is not :-( All the routers (except for Linux) return only
+/* All the routers (except for Linux) return only
8 bytes of packet payload. It means, that precise relaying of
ICMP in the real Internet is absolutely infeasible.
*/
@@ -462,92 +463,6 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
out:
read_unlock(&ipip6_lock);
return err;
-#else
- struct iphdr *iph = (struct iphdr*)dp;
- int hlen = iph->ihl<<2;
- struct ipv6hdr *iph6;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- int rel_type = 0;
- int rel_code = 0;
- int rel_info = 0;
- struct sk_buff *skb2;
- struct rt6_info *rt6i;
-
- if (len < hlen + sizeof(struct ipv6hdr))
- return;
- iph6 = (struct ipv6hdr*)(dp + hlen);
-
- switch (type) {
- default:
- return;
- case ICMP_PARAMETERPROB:
- if (icmp_hdr(skb)->un.gateway < hlen)
- return;
-
- /* So... This guy found something strange INSIDE encapsulated
- packet. Well, he is fool, but what can we do ?
- */
- rel_type = ICMPV6_PARAMPROB;
- rel_info = icmp_hdr(skb)->un.gateway - hlen;
- break;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* Too complicated case ... */
- return;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe, it is just ether pollution. --ANK
- */
- rel_type = ICMPV6_DEST_UNREACH;
- rel_code = ICMPV6_ADDR_UNREACH;
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return;
- rel_type = ICMPV6_TIME_EXCEED;
- rel_code = ICMPV6_EXC_HOPLIMIT;
- break;
- }
-
- /* Prepare fake skb to feed it to icmpv6_send */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- return 0;
- dst_release(skb2->dst);
- skb2->dst = NULL;
- skb_pull(skb2, skb->data - (u8*)iph6);
- skb_reset_network_header(skb2);
-
- /* Try to guess incoming interface */
- rt6i = rt6_lookup(dev_net(skb->dev), &iph6->saddr, NULL, NULL, 0);
- if (rt6i && rt6i->rt6i_dev) {
- skb2->dev = rt6i->rt6i_dev;
-
- rt6i = rt6_lookup(dev_net(skb->dev),
- &iph6->daddr, &iph6->saddr, NULL, 0);
-
- if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) {
- struct ip_tunnel *t = netdev_priv(rt6i->rt6i_dev);
- if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) {
- rel_type = ICMPV6_DEST_UNREACH;
- rel_code = ICMPV6_ADDR_UNREACH;
- }
- icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev);
- }
- }
- kfree_skb(skb2);
- return 0;
-#endif
}
static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
@@ -960,11 +875,20 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
break;
case SIOCGETPRL:
+ err = -EINVAL;
+ if (dev == sitn->fb_tunnel_dev)
+ goto done;
+ err = -ENOENT;
+ if (!(t = netdev_priv(dev)))
+ goto done;
+ err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
+ break;
+
case SIOCADDPRL:
case SIOCDELPRL:
case SIOCCHGPRL:
err = -EPERM;
- if (cmd != SIOCGETPRL && !capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN))
goto done;
err = -EINVAL;
if (dev == sitn->fb_tunnel_dev)
@@ -977,12 +901,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
goto done;
switch (cmd) {
- case SIOCGETPRL:
- err = ipip6_tunnel_get_prl(t, &prl);
- if (!err && copy_to_user(ifr->ifr_ifru.ifru_data,
- &prl, sizeof(prl)))
- err = -EFAULT;
- break;
case SIOCDELPRL:
err = ipip6_tunnel_del_prl(t, &prl);
break;
@@ -991,8 +909,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
break;
}
- if (cmd != SIOCGETPRL)
- netdev_state_change(dev);
+ netdev_state_change(dev);
break;
default:
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 938ce4e..3ecc115 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -198,7 +198,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
ireq = inet_rsk(req);
ireq6 = inet6_rsk(req);
treq = tcp_rsk(req);
- ireq6->pktopts = NULL;
if (security_inet_conn_request(sk, skb, req)) {
reqsk_free(req);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 715965f..cb46749 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1299,7 +1299,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
treq = inet6_rsk(req);
ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr);
ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr);
- treq->pktopts = NULL;
if (!want_cookie)
TCP_ECN_create_request(req, tcp_hdr(skb));
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index 6323921..669f280 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -109,7 +109,7 @@ static int tunnel46_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto drop;
for (handler = tunnel46_handlers; handler; handler = handler->next)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1fd784f..dd30962 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -534,7 +534,9 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
- if (up->pending) {
+ if (up->pending == AF_INET)
+ udp_flush_pending_frames(sk);
+ else if (up->pending) {
up->len = 0;
up->pending = 0;
ip6_flush_pending_frames(sk);
@@ -731,7 +733,7 @@ do_udp_sendmsg:
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(*opt);
- err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
+ err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -848,12 +850,14 @@ do_append_data:
} else {
dst_release(dst);
}
+ dst = NULL;
}
if (err > 0)
err = np->recverr ? net_xmit_errno(err) : 0;
release_sock(sk);
out:
+ dst_release(dst);
fl6_sock_release(flowlabel);
if (!err)
return len;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ae54b20..3eb5bcc 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1093,11 +1093,6 @@ static int irda_create(struct net *net, struct socket *sock, int protocol)
init_waitqueue_head(&self->query_wait);
- /* Initialise networking socket struct */
- sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */
- sk->sk_family = PF_IRDA;
- sk->sk_protocol = protocol;
-
switch (sock->type) {
case SOCK_STREAM:
sock->ops = &irda_stream_ops;
@@ -1124,13 +1119,20 @@ static int irda_create(struct net *net, struct socket *sock, int protocol)
self->max_sdu_size_rx = TTP_SAR_UNBOUND;
break;
default:
+ sk_free(sk);
return -ESOCKTNOSUPPORT;
}
break;
default:
+ sk_free(sk);
return -ESOCKTNOSUPPORT;
}
+ /* Initialise networking socket struct */
+ sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */
+ sk->sk_family = PF_IRDA;
+ sk->sk_protocol = protocol;
+
/* Register as a client with IrLMP */
self->ckey = irlmp_register_client(0, NULL, NULL, NULL);
self->mask.word = 0xffff;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9e7236f..7470e36 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1251,7 +1251,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
x->sel.prefixlen_s = addr->sadb_address_prefixlen;
}
- if (x->props.mode == XFRM_MODE_TRANSPORT)
+ if (!x->sel.family)
x->sel.family = x->props.family;
if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) {
@@ -3030,6 +3030,9 @@ static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
{
+ if (atomic_read(&pfkey_socks_nr) == 0)
+ return 0;
+
switch (c->event) {
case XFRM_MSG_EXPIRE:
return key_notify_sa_expire(x, c);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index e2ddde7..008de1f 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -286,12 +286,14 @@ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb,
*
* Sends received pdus to the sap state machine.
*/
-static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb)
+static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
+ struct sock *sk)
{
struct llc_sap_state_ev *ev = llc_sap_ev(skb);
ev->type = LLC_SAP_EV_TYPE_PDU;
ev->reason = 0;
+ skb->sk = sk;
llc_sap_state_process(sap, skb);
}
@@ -360,8 +362,7 @@ static void llc_sap_mcast(struct llc_sap *sap,
break;
sock_hold(sk);
- skb_set_owner_r(skb1, sk);
- llc_sap_rcv(sap, skb1);
+ llc_sap_rcv(sap, skb1, sk);
sock_put(sk);
}
read_unlock_bh(&sap->sk_list.lock);
@@ -381,8 +382,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
} else {
struct sock *sk = llc_lookup_dgram(sap, &laddr);
if (sk) {
- skb_set_owner_r(skb, sk);
- llc_sap_rcv(sap, skb);
+ llc_sap_rcv(sap, skb, sk);
sock_put(sk);
} else
kfree_skb(skb);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 699d97b..a9fce4a 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -672,7 +672,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
if (params->vlan) {
sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
- if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN ||
+ if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN &&
sdata->vif.type != IEEE80211_IF_TYPE_AP)
return -EINVAL;
} else
@@ -760,7 +760,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
if (params->vlan && params->vlan != sta->sdata->dev) {
vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
- if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN ||
+ if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN &&
vlansdata->vif.type != IEEE80211_IF_TYPE_AP) {
rcu_read_unlock();
return -EINVAL;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c7314bf..006486b 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -899,7 +899,7 @@ extern const struct iw_handler_def ieee80211_iw_handler_def;
/* ieee80211_ioctl.c */
-int ieee80211_set_freq(struct ieee80211_local *local, int freq);
+int ieee80211_set_freq(struct net_device *dev, int freq);
/* ieee80211_sta.c */
void ieee80211_sta_timer(unsigned long data);
void ieee80211_sta_work(struct work_struct *work);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 915afad..98c0b5e 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -511,6 +511,7 @@ static int ieee80211_stop(struct net_device *dev)
case IEEE80211_IF_TYPE_STA:
case IEEE80211_IF_TYPE_IBSS:
sdata->u.sta.state = IEEE80211_DISABLED;
+ memset(sdata->u.sta.bssid, 0, ETH_ALEN);
del_timer_sync(&sdata->u.sta.timer);
/*
* When we get here, the interface is marked down.
@@ -1313,7 +1314,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
/*
* Clear the TX filter mask for this STA when sending the next
* packet. If the STA went to power save mode, this will happen
- * happen when it wakes up for the next time.
+ * when it wakes up for the next time.
*/
sta->flags |= WLAN_STA_CLEAR_PS_FILT;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4adba09..4d2b582 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -44,7 +44,7 @@
#define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ)
#define IEEE80211_SCAN_INTERVAL (2 * HZ)
#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
-#define IEEE80211_IBSS_JOIN_TIMEOUT (20 * HZ)
+#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
#define IEEE80211_PROBE_DELAY (HZ / 33)
#define IEEE80211_CHANNEL_TIME (HZ / 33)
@@ -730,7 +730,17 @@ static void ieee80211_send_assoc(struct net_device *dev,
if (bss->wmm_ie) {
wmm = 1;
}
+
+ /* get all rates supported by the device and the AP as
+ * some APs don't like getting a superset of their rates
+ * in the association request (e.g. D-Link DAP 1353 in
+ * b-only mode) */
+ rates_len = ieee80211_compatible_rates(bss, sband, &rates);
+
ieee80211_rx_bss_put(dev, bss);
+ } else {
+ rates = ~0;
+ rates_len = sband->n_bitrates;
}
mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
@@ -761,10 +771,7 @@ static void ieee80211_send_assoc(struct net_device *dev,
*pos++ = ifsta->ssid_len;
memcpy(pos, ifsta->ssid, ifsta->ssid_len);
- /* all supported rates should be added here but some APs
- * (e.g. D-Link DAP 1353 in b-only mode) don't like that
- * Therefore only add rates the AP supports */
- rates_len = ieee80211_compatible_rates(bss, sband, &rates);
+ /* add all rates which were marked to be used above */
supp_rates_len = rates_len;
if (supp_rates_len > 8)
supp_rates_len = 8;
@@ -1318,7 +1325,7 @@ static void ieee80211_sta_process_addba_request(struct net_device *dev,
/* prepare reordering buffer */
tid_agg_rx->reorder_buf =
- kmalloc(buf_size * sizeof(struct sk_buf *), GFP_ATOMIC);
+ kmalloc(buf_size * sizeof(struct sk_buff *), GFP_ATOMIC);
if (!tid_agg_rx->reorder_buf) {
if (net_ratelimit())
printk(KERN_ERR "can not allocate reordering buffer "
@@ -1327,7 +1334,7 @@ static void ieee80211_sta_process_addba_request(struct net_device *dev,
goto end;
}
memset(tid_agg_rx->reorder_buf, 0,
- buf_size * sizeof(struct sk_buf *));
+ buf_size * sizeof(struct sk_buff *));
if (local->ops->ampdu_action)
ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START,
@@ -1607,7 +1614,7 @@ void sta_addba_resp_timer_expired(unsigned long data)
* only one argument, and both sta_info and TID are needed, so init
* flow in sta_info_create gives the TID as data, while the timer_to_id
* array gives the sta through container_of */
- u16 tid = *(int *)data;
+ u16 tid = *(u8 *)data;
struct sta_info *temp_sta = container_of((void *)data,
struct sta_info, timer_to_tid[tid]);
@@ -1655,7 +1662,7 @@ timer_expired_exit:
void sta_rx_agg_session_timer_expired(unsigned long data)
{
/* not an elegant detour, but there is no choice as the timer passes
- * only one argument, and verious sta_info are needed here, so init
+ * only one argument, and various sta_info are needed here, so init
* flow in sta_info_create gives the TID as data, while the timer_to_id
* array gives the sta through container_of */
u8 *ptid = (u8 *)data;
@@ -2329,6 +2336,7 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
u8 *pos;
struct ieee80211_sub_if_data *sdata;
struct ieee80211_supported_band *sband;
+ union iwreq_data wrqu;
sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
@@ -2351,13 +2359,10 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
sdata->drop_unencrypted = bss->capability &
WLAN_CAPABILITY_PRIVACY ? 1 : 0;
- res = ieee80211_set_freq(local, bss->freq);
+ res = ieee80211_set_freq(dev, bss->freq);
- if (local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS) {
- printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
- "%d MHz\n", dev->name, local->oper_channel->center_freq);
- return -1;
- }
+ if (res)
+ return res;
/* Set beacon template */
skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
@@ -2472,7 +2477,9 @@ static int ieee80211_sta_join_ibss(struct net_device *dev,
ifsta->state = IEEE80211_IBSS_JOINED;
mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
- ieee80211_rx_bss_put(dev, bss);
+ memset(&wrqu, 0, sizeof(wrqu));
+ memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN);
+ wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
return res;
}
@@ -3446,21 +3453,17 @@ static int ieee80211_sta_config_auth(struct net_device *dev,
struct ieee80211_sta_bss *bss, *selected = NULL;
int top_rssi = 0, freq;
- if (!(ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
- IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL))) {
- ifsta->state = IEEE80211_AUTHENTICATE;
- ieee80211_sta_reset_auth(dev, ifsta);
- return 0;
- }
-
spin_lock_bh(&local->sta_bss_lock);
freq = local->oper_channel->center_freq;
list_for_each_entry(bss, &local->sta_bss_list, list) {
if (!(bss->capability & WLAN_CAPABILITY_ESS))
continue;
- if (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
- !!sdata->default_key)
+ if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
+ IEEE80211_STA_AUTO_BSSID_SEL |
+ IEEE80211_STA_AUTO_CHANNEL_SEL)) &&
+ (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^
+ !!sdata->default_key))
continue;
if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) &&
@@ -3485,7 +3488,7 @@ static int ieee80211_sta_config_auth(struct net_device *dev,
spin_unlock_bh(&local->sta_bss_lock);
if (selected) {
- ieee80211_set_freq(local, selected->freq);
+ ieee80211_set_freq(dev, selected->freq);
if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
ieee80211_sta_set_ssid(dev, selected->ssid,
selected->ssid_len);
@@ -3520,6 +3523,7 @@ static int ieee80211_sta_create_ibss(struct net_device *dev,
struct ieee80211_supported_band *sband;
u8 bssid[ETH_ALEN], *pos;
int i;
+ int ret;
DECLARE_MAC_BUF(mac);
#if 0
@@ -3564,7 +3568,9 @@ static int ieee80211_sta_create_ibss(struct net_device *dev,
*pos++ = (u8) (rate / 5);
}
- return ieee80211_sta_join_ibss(dev, ifsta, bss);
+ ret = ieee80211_sta_join_ibss(dev, ifsta, bss);
+ ieee80211_rx_bss_put(dev, bss);
+ return ret;
}
@@ -3612,10 +3618,13 @@ static int ieee80211_sta_find_ibss(struct net_device *dev,
(bss = ieee80211_rx_bss_get(dev, bssid,
local->hw.conf.channel->center_freq,
ifsta->ssid, ifsta->ssid_len))) {
+ int ret;
printk(KERN_DEBUG "%s: Selected IBSS BSSID %s"
" based on configured SSID\n",
dev->name, print_mac(mac, bssid));
- return ieee80211_sta_join_ibss(dev, ifsta, bss);
+ ret = ieee80211_sta_join_ibss(dev, ifsta, bss);
+ ieee80211_rx_bss_put(dev, bss);
+ return ret;
}
#ifdef CONFIG_MAC80211_IBSS_DEBUG
printk(KERN_DEBUG " did not try to join ibss\n");
@@ -4092,18 +4101,17 @@ ieee80211_sta_scan_result(struct net_device *dev,
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = SIOCGIWFREQ;
- iwe.u.freq.m = bss->freq;
- iwe.u.freq.e = 6;
+ iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq);
+ iwe.u.freq.e = 0;
current_ev = iwe_stream_add_event(current_ev, end_buf, &iwe,
IW_EV_FREQ_LEN);
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = SIOCGIWFREQ;
- iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq);
- iwe.u.freq.e = 0;
+ iwe.u.freq.m = bss->freq;
+ iwe.u.freq.e = 6;
current_ev = iwe_stream_add_event(current_ev, end_buf, &iwe,
IW_EV_FREQ_LEN);
-
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = IWEVQUAL;
iwe.u.qual.qual = bss->signal;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1958bfb..0941e5d 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1091,7 +1091,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
u16 fc, hdrlen, ethertype;
u8 *payload;
u8 dst[ETH_ALEN];
- u8 src[ETH_ALEN];
+ u8 src[ETH_ALEN] __aligned(2);
struct sk_buff *skb = rx->skb;
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
DECLARE_MAC_BUF(mac);
@@ -1234,7 +1234,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx)
*/
static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx)
{
- static const u8 pae_group_addr[ETH_ALEN]
+ static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
= { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d7dd54..c80d589 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1132,7 +1132,7 @@ static int ieee80211_tx(struct net_device *dev, struct sk_buff *skb,
ieee80211_tx_handler *handler;
struct ieee80211_tx_data tx;
ieee80211_tx_result res = TX_DROP, res_prepare;
- int ret, i;
+ int ret, i, retries = 0;
WARN_ON(__ieee80211_queue_pending(local, control->queue));
@@ -1216,6 +1216,13 @@ retry:
if (!__ieee80211_queue_stopped(local, control->queue)) {
clear_bit(IEEE80211_LINK_STATE_PENDING,
&local->state[control->queue]);
+ retries++;
+ /*
+ * Driver bug, it's rejecting packets but
+ * not stopping queues.
+ */
+ if (WARN_ON_ONCE(retries > 5))
+ goto drop;
goto retry;
}
memcpy(&store->control, control,
@@ -1562,13 +1569,13 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
* be cloned. This could happen, e.g., with Linux bridge code passing
* us broadcast frames. */
- if (head_need > 0 || skb_header_cloned(skb)) {
+ if (head_need > 0 || skb_cloned(skb)) {
#if 0
printk(KERN_DEBUG "%s: need to reallocate buffer for %d bytes "
"of headroom\n", dev->name, head_need);
#endif
- if (skb_header_cloned(skb))
+ if (skb_cloned(skb))
I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
else
I802_DEBUG_INC(local->tx_expand_skb_head);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 24a465c..4e97b26 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -34,11 +34,11 @@ void *mac80211_wiphy_privid = &mac80211_wiphy_privid;
/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
-const unsigned char rfc1042_header[] =
+const unsigned char rfc1042_header[] __aligned(2) =
{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
-const unsigned char bridge_tunnel_header[] =
+const unsigned char bridge_tunnel_header[] __aligned(2) =
{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
@@ -389,6 +389,41 @@ void ieee80211_iterate_active_interfaces(
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_sub_if_data *sdata;
+ rtnl_lock();
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ switch (sdata->vif.type) {
+ case IEEE80211_IF_TYPE_INVALID:
+ case IEEE80211_IF_TYPE_MNTR:
+ case IEEE80211_IF_TYPE_VLAN:
+ continue;
+ case IEEE80211_IF_TYPE_AP:
+ case IEEE80211_IF_TYPE_STA:
+ case IEEE80211_IF_TYPE_IBSS:
+ case IEEE80211_IF_TYPE_WDS:
+ case IEEE80211_IF_TYPE_MESH_POINT:
+ break;
+ }
+ if (sdata->dev == local->mdev)
+ continue;
+ if (netif_running(sdata->dev))
+ iterator(data, sdata->dev->dev_addr,
+ &sdata->vif);
+ }
+
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces);
+
+void ieee80211_iterate_active_interfaces_atomic(
+ struct ieee80211_hw *hw,
+ void (*iterator)(void *data, u8 *mac,
+ struct ieee80211_vif *vif),
+ void *data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata;
+
rcu_read_lock();
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
@@ -413,4 +448,4 @@ void ieee80211_iterate_active_interfaces(
rcu_read_unlock();
}
-EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces);
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 76e1de1..6106cb7 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -209,7 +209,6 @@ static int ieee80211_ioctl_giwrange(struct net_device *dev,
range->num_frequency = c;
IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
- IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWTHRSPY);
IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
@@ -291,14 +290,22 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev,
return 0;
}
-int ieee80211_set_freq(struct ieee80211_local *local, int freqMHz)
+int ieee80211_set_freq(struct net_device *dev, int freqMHz)
{
int ret = -EINVAL;
struct ieee80211_channel *chan;
+ struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
chan = ieee80211_get_channel(local->hw.wiphy, freqMHz);
if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) {
+ if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS &&
+ chan->flags & IEEE80211_CHAN_NO_IBSS) {
+ printk(KERN_DEBUG "%s: IBSS not allowed on frequency "
+ "%d MHz\n", dev->name, chan->center_freq);
+ return ret;
+ }
local->oper_channel = chan;
if (local->sta_sw_scanning || local->sta_hw_scanning)
@@ -316,7 +323,6 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev,
struct iw_request_info *info,
struct iw_freq *freq, char *extra)
{
- struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
if (sdata->vif.type == IEEE80211_IF_TYPE_STA)
@@ -330,14 +336,14 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev,
IEEE80211_STA_AUTO_CHANNEL_SEL;
return 0;
} else
- return ieee80211_set_freq(local,
+ return ieee80211_set_freq(dev,
ieee80211_channel_to_frequency(freq->m));
} else {
int i, div = 1000000;
for (i = 0; i < freq->e; i++)
div /= 10;
if (div > 0)
- return ieee80211_set_freq(local, freq->m / div);
+ return ieee80211_set_freq(dev, freq->m / div);
else
return -EINVAL;
}
@@ -490,9 +496,15 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
if (sdata->vif.type == IEEE80211_IF_TYPE_STA ||
sdata->vif.type == IEEE80211_IF_TYPE_IBSS) {
- ap_addr->sa_family = ARPHRD_ETHER;
- memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
- return 0;
+ if (sdata->u.sta.state == IEEE80211_ASSOCIATED ||
+ sdata->u.sta.state == IEEE80211_IBSS_JOINED) {
+ ap_addr->sa_family = ARPHRD_ETHER;
+ memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
+ return 0;
+ } else {
+ memset(&ap_addr->sa_data, 0, ETH_ALEN);
+ return 0;
+ }
} else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) {
ap_addr->sa_family = ARPHRD_ETHER;
memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN);
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index dc1598b..635b996 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -673,7 +673,7 @@ int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
#ifdef CONFIG_MAC80211_HT_DEBUG
if (net_ratelimit())
printk(KERN_DEBUG "allocated aggregation queue"
- " %d tid %d addr %s pool=0x%lX",
+ " %d tid %d addr %s pool=0x%lX\n",
i, tid, print_mac(mac, sta->addr),
q->qdisc_pool[0]);
#endif /* CONFIG_MAC80211_HT_DEBUG */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c4b1799..662c1cc 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -196,8 +196,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
if (l4proto && l4proto->destroy)
l4proto->destroy(ct);
- nf_ct_ext_destroy(ct);
-
rcu_read_unlock();
spin_lock_bh(&nf_conntrack_lock);
@@ -520,6 +518,7 @@ static void nf_conntrack_free_rcu(struct rcu_head *head)
void nf_conntrack_free(struct nf_conn *ct)
{
+ nf_ct_ext_destroy(ct);
call_rcu(&ct->rcu, nf_conntrack_free_rcu);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index e31beeb..e8f0dea 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -587,10 +587,10 @@ int __init nf_conntrack_expect_init(void)
return 0;
err3:
+ kmem_cache_destroy(nf_ct_expect_cachep);
+err2:
nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
nf_ct_expect_hsize);
-err2:
- kmem_cache_destroy(nf_ct_expect_cachep);
err1:
return err;
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bcc19fa..8a3f8b3 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -59,12 +59,19 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
if (!*ext)
return NULL;
+ INIT_RCU_HEAD(&(*ext)->rcu);
(*ext)->offset[id] = off;
(*ext)->len = len;
return (void *)(*ext) + off;
}
+static void __nf_ct_ext_free_rcu(struct rcu_head *head)
+{
+ struct nf_ct_ext *ext = container_of(head, struct nf_ct_ext, rcu);
+ kfree(ext);
+}
+
void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
struct nf_ct_ext *new;
@@ -106,7 +113,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
(void *)ct->ext + ct->ext->offset[i]);
rcu_read_unlock();
}
- kfree(ct->ext);
+ call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu);
ct->ext = new;
}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 95da1a2..2f83c15 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -619,6 +619,7 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = {
static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
.name = "H.245",
.me = THIS_MODULE,
+ .tuple.src.l3num = AF_UNSPEC,
.tuple.dst.protonum = IPPROTO_UDP,
.help = h245_help,
.expect_policy = &h245_exp_policy,
@@ -1765,6 +1766,7 @@ static void __exit nf_conntrack_h323_fini(void)
nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
kfree(h323_buffer);
pr_debug("nf_ct_h323: fini\n");
}
@@ -1777,28 +1779,34 @@ static int __init nf_conntrack_h323_init(void)
h323_buffer = kmalloc(65536, GFP_KERNEL);
if (!h323_buffer)
return -ENOMEM;
- ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245);
if (ret < 0)
goto err1;
- ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
if (ret < 0)
goto err2;
- ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
if (ret < 0)
goto err3;
- ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
if (ret < 0)
goto err4;
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+ if (ret < 0)
+ goto err5;
pr_debug("nf_ct_h323: init success\n");
return 0;
-err4:
+err5:
nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
-err3:
+err4:
nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
-err2:
+err3:
nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+err2:
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
err1:
+ kfree(h323_buffer);
return ret;
}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index bc11d70..9fda6ee 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -92,10 +92,6 @@ void nf_log_packet(int pf,
vsnprintf(prefix, sizeof(prefix), fmt, args);
va_end(args);
logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
- } else if (net_ratelimit()) {
- printk(KERN_WARNING "nf_log_packet: can\'t log since "
- "no backend logging module loaded in! Please either "
- "load one, or disable logging explicitly\n");
}
rcu_read_unlock();
}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 2e89a00..70907f6 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -73,7 +73,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,
static inline bool already_closed(const struct nf_conn *conn)
{
if (nf_ct_protonum(conn) == IPPROTO_TCP)
- return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT;
+ return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
+ conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
else
return 0;
}
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
index feb326f..47bbf45 100644
--- a/net/netlink/attr.c
+++ b/net/netlink/attr.c
@@ -400,13 +400,13 @@ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
- * Returns -1 if the tailroom of the skb is insufficient to store
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute header and payload.
*/
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
{
if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
- return -1;
+ return -EMSGSIZE;
__nla_put(skb, attrtype, attrlen, data);
return 0;
@@ -418,13 +418,13 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
- * Returns -1 if the tailroom of the skb is insufficient to store
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute payload.
*/
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
- return -1;
+ return -EMSGSIZE;
__nla_put_nohdr(skb, attrlen, data);
return 0;
@@ -436,13 +436,13 @@ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
- * Returns -1 if the tailroom of the skb is insufficient to store
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute payload.
*/
int nla_append(struct sk_buff *skb, int attrlen, const void *data)
{
if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
- return -1;
+ return -EMSGSIZE;
memcpy(skb_put(skb, attrlen), data, attrlen);
return 0;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index d16929c..3e1191c 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -444,8 +444,11 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (ops->dumpit == NULL)
return -EOPNOTSUPP;
- return netlink_dump_start(genl_sock, skb, nlh,
- ops->dumpit, ops->done);
+ genl_unlock();
+ err = netlink_dump_start(genl_sock, skb, nlh,
+ ops->dumpit, ops->done);
+ genl_lock();
+ return err;
}
if (ops->doit == NULL)
@@ -554,7 +557,8 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
return genlmsg_end(skb, hdr);
nla_put_failure:
- return genlmsg_cancel(skb, hdr);
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
}
static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid,
@@ -590,7 +594,8 @@ static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid,
return genlmsg_end(skb, hdr);
nla_put_failure:
- return genlmsg_cancel(skb, hdr);
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
}
static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
@@ -601,9 +606,6 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
int chains_to_skip = cb->args[0];
int fams_to_skip = cb->args[1];
- if (chains_to_skip != 0)
- genl_lock();
-
for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
if (i < chains_to_skip)
continue;
@@ -621,9 +623,6 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
}
errout:
- if (chains_to_skip != 0)
- genl_unlock();
-
cb->args[0] = i;
cb->args[1] = n;
@@ -768,7 +767,7 @@ static int __init genl_init(void)
/* we'll bump the group number right afterwards */
genl_sock = netlink_kernel_create(&init_net, NETLINK_GENERIC, 0,
- genl_rcv, NULL, THIS_MODULE);
+ genl_rcv, &genl_mutex, THIS_MODULE);
if (genl_sock == NULL)
panic("GENL: Cannot initialize generic netlink\n");
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1086df7..9360fc8 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -220,7 +220,7 @@ replay:
tp = kzalloc(sizeof(*tp), GFP_KERNEL);
if (tp == NULL)
goto errout;
- err = -EINVAL;
+ err = -ENOENT;
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
if (tp_ops == NULL) {
#ifdef CONFIG_KMOD
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 0df911f..64465ba 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -444,7 +444,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
return nla_nest_end(skb, opts);
nla_put_failure:
- return nla_nest_cancel(skb, opts);
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
}
static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -466,7 +467,8 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
return nla_nest_end(skb, opts);
nla_put_failure:
- return nla_nest_cancel(skb, opts);
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
}
static const struct Qdisc_class_ops dsmark_class_ops = {
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 3a9d226..c89fba5 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -582,7 +582,8 @@ append_opt:
return nla_nest_end(skb, opts);
nla_put_failure:
- return nla_nest_cancel(skb, opts);
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
}
static void gred_destroy(struct Qdisc *sch)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 87293d0..fdfaa3f 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1360,7 +1360,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
nla_put_failure:
nla_nest_cancel(skb, nest);
- return -1;
+ return -EMSGSIZE;
}
static int
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 5bc1ed4..6807c97 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -28,6 +28,7 @@
* $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
*/
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -53,13 +54,17 @@
*/
#define HTB_HSIZE 16 /* classid hash size */
-#define HTB_HYSTERESIS 1 /* whether to use mode hysteresis for speedup */
+static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */
#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
#endif
+/* Module parameter and sysfs export */
+module_param (htb_hysteresis, int, 0640);
+MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+
/* used internaly to keep status of single class */
enum htb_cmode {
HTB_CANT_SEND, /* class can't send and can't borrow */
@@ -462,19 +467,21 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
htb_remove_class_from_row(q, cl, mask);
}
-#if HTB_HYSTERESIS
static inline long htb_lowater(const struct htb_class *cl)
{
- return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+ if (htb_hysteresis)
+ return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+ else
+ return 0;
}
static inline long htb_hiwater(const struct htb_class *cl)
{
- return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+ if (htb_hysteresis)
+ return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+ else
+ return 0;
}
-#else
-#define htb_lowater(cl) (0)
-#define htb_hiwater(cl) (0)
-#endif
+
/**
* htb_class_mode - computes and returns current class mode
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3dcd493..5c56985 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -281,7 +281,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
return nla_nest_end(skb, opts);
nla_put_failure:
- return nla_nest_cancel(skb, opts);
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
}
static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index b4cd2b7..024c3eb 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -474,6 +474,15 @@ static void sctp_association_destroy(struct sctp_association *asoc)
void sctp_assoc_set_primary(struct sctp_association *asoc,
struct sctp_transport *transport)
{
+ int changeover = 0;
+
+ /* it's a changeover only if we already have a primary path
+ * that we are changing
+ */
+ if (asoc->peer.primary_path != NULL &&
+ asoc->peer.primary_path != transport)
+ changeover = 1 ;
+
asoc->peer.primary_path = transport;
/* Set a default msg_name for events. */
@@ -499,12 +508,12 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
* double switch to the same destination address.
*/
if (transport->cacc.changeover_active)
- transport->cacc.cycling_changeover = 1;
+ transport->cacc.cycling_changeover = changeover;
/* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that
* a changeover has occurred.
*/
- transport->cacc.changeover_active = 1;
+ transport->cacc.changeover_active = changeover;
/* 3) The sender MUST store the next TSN to be sent in
* next_tsn_at_change.
@@ -1203,6 +1212,9 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
struct list_head *head = &asoc->peer.transport_addr_list;
struct list_head *pos;
+ if (asoc->peer.transport_count == 1)
+ return;
+
/* Find the next transport in a round-robin fashion. */
t = asoc->peer.retran_path;
pos = &t->transports;
@@ -1217,6 +1229,15 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
t = list_entry(pos, struct sctp_transport, transports);
+ /* We have exhausted the list, but didn't find any
+ * other active transports. If so, use the next
+ * transport.
+ */
+ if (t == asoc->peer.retran_path) {
+ t = next;
+ break;
+ }
+
/* Try to find an active transport. */
if ((t->state == SCTP_ACTIVE) ||
@@ -1229,15 +1250,6 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
if (!next)
next = t;
}
-
- /* We have exhausted the list, but didn't find any
- * other active transports. If so, use the next
- * transport.
- */
- if (t == asoc->peer.retran_path) {
- t = next;
- break;
- }
}
asoc->peer.retran_path = t;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e45e44c..a2f4d4d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -299,7 +299,8 @@ static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
/* Fills in the source address(saddr) based on the destination address(daddr)
* and asoc's bind address list.
*/
-static void sctp_v6_get_saddr(struct sctp_association *asoc,
+static void sctp_v6_get_saddr(struct sctp_sock *sk,
+ struct sctp_association *asoc,
struct dst_entry *dst,
union sctp_addr *daddr,
union sctp_addr *saddr)
@@ -318,7 +319,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
if (!asoc) {
ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL,
&daddr->v6.sin6_addr,
- inet6_sk(asoc->base.sk)->srcprefs,
+ inet6_sk(&sk->inet.sk)->srcprefs,
&saddr->v6.sin6_addr);
SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " NIP6_FMT "\n",
NIP6(saddr->v6.sin6_addr));
@@ -726,6 +727,11 @@ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
seq_printf(seq, NIP6_FMT " ", NIP6(addr->v6.sin6_addr));
}
+static void sctp_v6_ecn_capable(struct sock *sk)
+{
+ inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
+}
+
/* Initialize a PF_INET6 socket msg_name. */
static void sctp_inet6_msgname(char *msgname, int *addr_len)
{
@@ -996,6 +1002,7 @@ static struct sctp_af sctp_af_inet6 = {
.skb_iif = sctp_v6_skb_iif,
.is_ce = sctp_v6_is_ce,
.seq_dump_addr = sctp_v6_seq_dump_addr,
+ .ecn_capable = sctp_v6_ecn_capable,
.net_header_len = sizeof(struct ipv6hdr),
.sockaddr_len = sizeof(struct sockaddr_in6),
#ifdef CONFIG_COMPAT
diff --git a/net/sctp/output.c b/net/sctp/output.c
index cf4f9fb..6d45bae 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -548,7 +548,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
* Note: The works for IPv6 layer checks this bit too later
* in transmission. See IP6_ECN_flow_xmit().
*/
- INET_ECN_xmit(nskb->sk);
+ (*tp->af_specific->ecn_capable)(nskb->sk);
/* Set up the IP options. */
/* BUG: not implemented
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 59edfd2..ace6770 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -208,6 +208,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
INIT_LIST_HEAD(&q->sacked);
INIT_LIST_HEAD(&q->abandoned);
+ q->fast_rtx = 0;
q->outstanding_bytes = 0;
q->empty = 1;
q->cork = 0;
@@ -500,6 +501,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
case SCTP_RTXR_FAST_RTX:
SCTP_INC_STATS(SCTP_MIB_FAST_RETRANSMITS);
sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX);
+ q->fast_rtx = 1;
break;
case SCTP_RTXR_PMTUD:
SCTP_INC_STATS(SCTP_MIB_PMTUD_RETRANSMITS);
@@ -518,9 +520,15 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
* the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by
* following the procedures outlined in C1 - C5.
*/
- sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+ if (reason == SCTP_RTXR_T3_RTX)
+ sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
- error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+ /* Flush the queues only on timeout, since fast_rtx is only
+ * triggered during sack processing and the queue
+ * will be flushed at the end.
+ */
+ if (reason != SCTP_RTXR_FAST_RTX)
+ error = sctp_outq_flush(q, /* rtx_timeout */ 1);
if (error)
q->asoc->base.sk->sk_err = -error;
@@ -538,17 +546,23 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
int rtx_timeout, int *start_timer)
{
struct list_head *lqueue;
- struct list_head *lchunk;
struct sctp_transport *transport = pkt->transport;
sctp_xmit_t status;
struct sctp_chunk *chunk, *chunk1;
struct sctp_association *asoc;
+ int fast_rtx;
int error = 0;
+ int timer = 0;
+ int done = 0;
asoc = q->asoc;
lqueue = &q->retransmit;
+ fast_rtx = q->fast_rtx;
- /* RFC 2960 6.3.3 Handle T3-rtx Expiration
+ /* This loop handles time-out retransmissions, fast retransmissions,
+ * and retransmissions due to opening of whindow.
+ *
+ * RFC 2960 6.3.3 Handle T3-rtx Expiration
*
* E3) Determine how many of the earliest (i.e., lowest TSN)
* outstanding DATA chunks for the address for which the
@@ -563,12 +577,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
* [Just to be painfully clear, if we are retransmitting
* because a timeout just happened, we should send only ONE
* packet of retransmitted data.]
+ *
+ * For fast retransmissions we also send only ONE packet. However,
+ * if we are just flushing the queue due to open window, we'll
+ * try to send as much as possible.
*/
- lchunk = sctp_list_dequeue(lqueue);
-
- while (lchunk) {
- chunk = list_entry(lchunk, struct sctp_chunk,
- transmitted_list);
+ list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) {
/* Make sure that Gap Acked TSNs are not retransmitted. A
* simple approach is just to move such TSNs out of the
@@ -576,58 +590,60 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
* next chunk.
*/
if (chunk->tsn_gap_acked) {
- list_add_tail(lchunk, &transport->transmitted);
- lchunk = sctp_list_dequeue(lqueue);
+ list_del(&chunk->transmitted_list);
+ list_add_tail(&chunk->transmitted_list,
+ &transport->transmitted);
continue;
}
+ /* If we are doing fast retransmit, ignore non-fast_rtransmit
+ * chunks
+ */
+ if (fast_rtx && !chunk->fast_retransmit)
+ continue;
+
/* Attempt to append this chunk to the packet. */
status = sctp_packet_append_chunk(pkt, chunk);
switch (status) {
case SCTP_XMIT_PMTU_FULL:
/* Send this packet. */
- if ((error = sctp_packet_transmit(pkt)) == 0)
- *start_timer = 1;
+ error = sctp_packet_transmit(pkt);
/* If we are retransmitting, we should only
* send a single packet.
*/
- if (rtx_timeout) {
- list_add(lchunk, lqueue);
- lchunk = NULL;
- }
+ if (rtx_timeout || fast_rtx)
+ done = 1;
- /* Bundle lchunk in the next round. */
+ /* Bundle next chunk in the next round. */
break;
case SCTP_XMIT_RWND_FULL:
/* Send this packet. */
- if ((error = sctp_packet_transmit(pkt)) == 0)
- *start_timer = 1;
+ error = sctp_packet_transmit(pkt);
/* Stop sending DATA as there is no more room
* at the receiver.
*/
- list_add(lchunk, lqueue);
- lchunk = NULL;
+ done = 1;
break;
case SCTP_XMIT_NAGLE_DELAY:
/* Send this packet. */
- if ((error = sctp_packet_transmit(pkt)) == 0)
- *start_timer = 1;
+ error = sctp_packet_transmit(pkt);
/* Stop sending DATA because of nagle delay. */
- list_add(lchunk, lqueue);
- lchunk = NULL;
+ done = 1;
break;
default:
/* The append was successful, so add this chunk to
* the transmitted list.
*/
- list_add_tail(lchunk, &transport->transmitted);
+ list_del(&chunk->transmitted_list);
+ list_add_tail(&chunk->transmitted_list,
+ &transport->transmitted);
/* Mark the chunk as ineligible for fast retransmit
* after it is retransmitted.
@@ -635,27 +651,44 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
if (chunk->fast_retransmit > 0)
chunk->fast_retransmit = -1;
- *start_timer = 1;
- q->empty = 0;
+ /* Force start T3-rtx timer when fast retransmitting
+ * the earliest outstanding TSN
+ */
+ if (!timer && fast_rtx &&
+ ntohl(chunk->subh.data_hdr->tsn) ==
+ asoc->ctsn_ack_point + 1)
+ timer = 2;
- /* Retrieve a new chunk to bundle. */
- lchunk = sctp_list_dequeue(lqueue);
+ q->empty = 0;
break;
}
- /* If we are here due to a retransmit timeout or a fast
- * retransmit and if there are any chunks left in the retransmit
- * queue that could not fit in the PMTU sized packet, they need
- * to be marked as ineligible for a subsequent fast retransmit.
- */
- if (rtx_timeout && !lchunk) {
- list_for_each_entry(chunk1, lqueue, transmitted_list) {
- if (chunk1->fast_retransmit > 0)
- chunk1->fast_retransmit = -1;
- }
+ /* Set the timer if there were no errors */
+ if (!error && !timer)
+ timer = 1;
+
+ if (done)
+ break;
+ }
+
+ /* If we are here due to a retransmit timeout or a fast
+ * retransmit and if there are any chunks left in the retransmit
+ * queue that could not fit in the PMTU sized packet, they need
+ * to be marked as ineligible for a subsequent fast retransmit.
+ */
+ if (rtx_timeout || fast_rtx) {
+ list_for_each_entry(chunk1, lqueue, transmitted_list) {
+ if (chunk1->fast_retransmit > 0)
+ chunk1->fast_retransmit = -1;
}
}
+ *start_timer = timer;
+
+ /* Clear fast retransmit hint */
+ if (fast_rtx)
+ q->fast_rtx = 0;
+
return error;
}
@@ -862,7 +895,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
rtx_timeout, &start_timer);
if (start_timer)
- sctp_transport_reset_timers(transport);
+ sctp_transport_reset_timers(transport,
+ start_timer-1);
/* This can happen on COOKIE-ECHO resend. Only
* one chunk can get bundled with a COOKIE-ECHO.
@@ -977,7 +1011,7 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
list_add_tail(&chunk->transmitted_list,
&transport->transmitted);
- sctp_transport_reset_timers(transport);
+ sctp_transport_reset_timers(transport, start_timer-1);
q->empty = 0;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 0ec234b..9258dfe 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -108,14 +108,23 @@ static __init int sctp_proc_init(void)
}
if (sctp_snmp_proc_init())
- goto out_nomem;
+ goto out_snmp_proc_init;
if (sctp_eps_proc_init())
- goto out_nomem;
+ goto out_eps_proc_init;
if (sctp_assocs_proc_init())
- goto out_nomem;
+ goto out_assocs_proc_init;
return 0;
+out_assocs_proc_init:
+ sctp_eps_proc_exit();
+out_eps_proc_init:
+ sctp_snmp_proc_exit();
+out_snmp_proc_init:
+ if (proc_net_sctp) {
+ proc_net_sctp = NULL;
+ remove_proc_entry("sctp", init_net.proc_net);
+ }
out_nomem:
return -ENOMEM;
}
@@ -470,11 +479,11 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
/* Walk through the bind address list and look for a bind
* address that matches the source address of the returned dst.
*/
+ sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
continue;
- sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
goto out_unlock;
}
@@ -519,7 +528,8 @@ out:
/* For v4, the source address is cached in the route entry(dst). So no need
* to cache it separately and hence this is an empty routine.
*/
-static void sctp_v4_get_saddr(struct sctp_association *asoc,
+static void sctp_v4_get_saddr(struct sctp_sock *sk,
+ struct sctp_association *asoc,
struct dst_entry *dst,
union sctp_addr *daddr,
union sctp_addr *saddr)
@@ -616,6 +626,11 @@ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
seq_printf(seq, "%d.%d.%d.%d ", NIPQUAD(addr->v4.sin_addr));
}
+static void sctp_v4_ecn_capable(struct sock *sk)
+{
+ INET_ECN_xmit(sk);
+}
+
/* Event handler for inet address addition/deletion events.
* The sctp_local_addr_list needs to be protocted by a spin lock since
* multiple notifiers (say IPv4 and IPv6) may be running at the same
@@ -934,6 +949,7 @@ static struct sctp_af sctp_af_inet = {
.skb_iif = sctp_v4_skb_iif,
.is_ce = sctp_v4_is_ce,
.seq_dump_addr = sctp_v4_seq_dump_addr,
+ .ecn_capable = sctp_v4_ecn_capable,
.net_header_len = sizeof(struct iphdr),
.sockaddr_len = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e7e3baf..0dbcde6 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4401,7 +4401,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
if (copy_from_user(&getaddrs, optval, len))
return -EFAULT;
- if (getaddrs.addr_num <= 0) return -EINVAL;
+ if (getaddrs.addr_num <= 0 ||
+ getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr)))
+ return -EINVAL;
/*
* For UDP-style sockets, id specifies the association to query.
* If the id field is set to the value '0' then the locally bound
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f4938f6..3f34f61 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -79,6 +79,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
peer->rttvar = 0;
peer->srtt = 0;
peer->rto_pending = 0;
+ peer->fast_recovery = 0;
peer->last_time_heard = jiffies;
peer->last_time_used = jiffies;
@@ -190,7 +191,7 @@ static void sctp_transport_destroy(struct sctp_transport *transport)
/* Start T3_rtx timer if it is not already running and update the heartbeat
* timer. This routine is called every time a DATA chunk is sent.
*/
-void sctp_transport_reset_timers(struct sctp_transport *transport)
+void sctp_transport_reset_timers(struct sctp_transport *transport, int force)
{
/* RFC 2960 6.3.2 Retransmission Timer Rules
*
@@ -200,7 +201,7 @@ void sctp_transport_reset_timers(struct sctp_transport *transport)
* address.
*/
- if (!timer_pending(&transport->T3_rtx_timer))
+ if (force || !timer_pending(&transport->T3_rtx_timer))
if (!mod_timer(&transport->T3_rtx_timer,
jiffies + transport->rto))
sctp_transport_hold(transport);
@@ -291,7 +292,7 @@ void sctp_transport_route(struct sctp_transport *transport,
if (saddr)
memcpy(&transport->saddr, saddr, sizeof(union sctp_addr));
else
- af->get_saddr(asoc, dst, daddr, &transport->saddr);
+ af->get_saddr(opt, asoc, dst, daddr, &transport->saddr);
transport->dst = dst;
if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
@@ -403,11 +404,16 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
cwnd = transport->cwnd;
flight_size = transport->flight_size;
+ /* See if we need to exit Fast Recovery first */
+ if (transport->fast_recovery &&
+ TSN_lte(transport->fast_recovery_exit, sack_ctsn))
+ transport->fast_recovery = 0;
+
/* The appropriate cwnd increase algorithm is performed if, and only
- * if the cumulative TSN has advanced and the congestion window is
+ * if the cumulative TSN whould advanced and the congestion window is
* being fully utilized.
*/
- if ((transport->asoc->ctsn_ack_point >= sack_ctsn) ||
+ if (TSN_lte(sack_ctsn, transport->asoc->ctsn_ack_point) ||
(flight_size < cwnd))
return;
@@ -416,17 +422,23 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
pmtu = transport->asoc->pathmtu;
if (cwnd <= ssthresh) {
- /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
- * than or equal to ssthresh an SCTP endpoint MUST use the
- * slow start algorithm to increase cwnd only if the current
- * congestion window is being fully utilized and an incoming
- * SACK advances the Cumulative TSN Ack Point. Only when these
- * two conditions are met can the cwnd be increased otherwise
- * the cwnd MUST not be increased. If these conditions are met
- * then cwnd MUST be increased by at most the lesser of
- * 1) the total size of the previously outstanding DATA
- * chunk(s) acknowledged, and 2) the destination's path MTU.
+ /* RFC 4960 7.2.1
+ * o When cwnd is less than or equal to ssthresh, an SCTP
+ * endpoint MUST use the slow-start algorithm to increase
+ * cwnd only if the current congestion window is being fully
+ * utilized, an incoming SACK advances the Cumulative TSN
+ * Ack Point, and the data sender is not in Fast Recovery.
+ * Only when these three conditions are met can the cwnd be
+ * increased; otherwise, the cwnd MUST not be increased.
+ * If these conditions are met, then cwnd MUST be increased
+ * by, at most, the lesser of 1) the total size of the
+ * previously outstanding DATA chunk(s) acknowledged, and
+ * 2) the destination's path MTU. This upper bound protects
+ * against the ACK-Splitting attack outlined in [SAVAGE99].
*/
+ if (transport->fast_recovery)
+ return;
+
if (bytes_acked > pmtu)
cwnd += pmtu;
else
@@ -502,6 +514,13 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* cwnd = ssthresh
* partial_bytes_acked = 0
*/
+ if (transport->fast_recovery)
+ return;
+
+ /* Mark Fast recovery */
+ transport->fast_recovery = 1;
+ transport->fast_recovery_exit = transport->asoc->next_tsn - 1;
+
transport->ssthresh = max(transport->cwnd/2,
4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
@@ -586,6 +605,7 @@ void sctp_transport_reset(struct sctp_transport *t)
t->flight_size = 0;
t->error_count = 0;
t->rto_pending = 0;
+ t->fast_recovery = 0;
/* Initialize the state information for SFR-CACC */
t->cacc.changeover_active = 0;
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index d927d9f..744b79f 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -17,8 +17,8 @@
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-#define RPC_ANONYMOUS_USERID ((uid_t)-2)
-#define RPC_ANONYMOUS_GROUPID ((gid_t)-2)
+#define RPC_MACHINE_CRED_USERID ((uid_t)0)
+#define RPC_MACHINE_CRED_GROUPID ((gid_t)0)
struct generic_cred {
struct rpc_cred gc_base;
@@ -44,8 +44,8 @@ EXPORT_SYMBOL_GPL(rpc_lookup_cred);
struct rpc_cred *rpc_lookup_machine_cred(void)
{
struct auth_cred acred = {
- .uid = RPC_ANONYMOUS_USERID,
- .gid = RPC_ANONYMOUS_GROUPID,
+ .uid = RPC_MACHINE_CRED_USERID,
+ .gid = RPC_MACHINE_CRED_GROUPID,
.machine_cred = 1,
};
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index d8e8d79..e46c825 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -6,30 +6,9 @@
#include <linux/sched.h>
#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <net/sock.h>
-#include <net/checksum.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/tcp_states.h>
-#include <linux/uaccess.h>
-#include <asm/ioctls.h>
-
-#include <linux/sunrpc/types.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svc_xprt.h>
@@ -296,8 +275,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
if (!(xprt->xpt_flags &
((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
return;
- if (test_bit(XPT_DEAD, &xprt->xpt_flags))
- return;
cpu = get_cpu();
pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 3f30ee6..f24800f 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -278,7 +278,7 @@ static int ip_map_show(struct seq_file *m,
dom = im->m_client->h.name;
if (ipv6_addr_v4mapped(&addr)) {
- seq_printf(m, "%s" NIPQUAD_FMT "%s\n",
+ seq_printf(m, "%s " NIPQUAD_FMT " %s\n",
im->m_class,
ntohl(addr.s6_addr32[3]) >> 24 & 0xff,
ntohl(addr.s6_addr32[3]) >> 16 & 0xff,
@@ -286,7 +286,7 @@ static int ip_map_show(struct seq_file *m,
ntohl(addr.s6_addr32[3]) >> 0 & 0xff,
dom);
} else {
- seq_printf(m, "%s" NIP6_FMT "%s\n",
+ seq_printf(m, "%s " NIP6_FMT " %s\n",
im->m_class, NIP6(addr), dom);
}
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 88c0ca2..8710117 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
+/* Temporary NFS request map and context caches */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
/*
* This function implements reading and resetting an atomic_t stat
* variable through read/write to a proc file. Any write to the file
@@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = {
void svc_rdma_cleanup(void)
{
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+ flush_scheduled_work();
if (svcrdma_table_header) {
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
}
svc_unreg_xprt_class(&svc_rdma_class);
+ kmem_cache_destroy(svc_rdma_map_cachep);
+ kmem_cache_destroy(svc_rdma_ctxt_cachep);
}
int svc_rdma_init(void)
@@ -255,9 +262,37 @@ int svc_rdma_init(void)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
+ /* Create the temporary map cache */
+ svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
+ sizeof(struct svc_rdma_req_map),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!svc_rdma_map_cachep) {
+ printk(KERN_INFO "Could not allocate map cache.\n");
+ goto err0;
+ }
+
+ /* Create the temporary context cache */
+ svc_rdma_ctxt_cachep =
+ kmem_cache_create("svc_rdma_ctxt_cache",
+ sizeof(struct svc_rdma_op_ctxt),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!svc_rdma_ctxt_cachep) {
+ printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+ goto err1;
+ }
+
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
return 0;
+ err1:
+ kmem_cache_destroy(svc_rdma_map_cachep);
+ err0:
+ unregister_sysctl_table(svcrdma_table_header);
+ return -ENOMEM;
}
MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("SVC RDMA Transport");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c22d6b6..b4b17f4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -112,11 +112,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0].iov_len = 0;
}
-struct chunk_sge {
- int start; /* sge no for this chunk */
- int count; /* sge count for this chunk */
-};
-
/* Encode a read-chunk-list as an array of IB SGE
*
* Assumptions:
@@ -134,8 +129,8 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head,
struct rpcrdma_msg *rmsgp,
- struct ib_sge *sge,
- struct chunk_sge *ch_sge_ary,
+ struct svc_rdma_req_map *rpl_map,
+ struct svc_rdma_req_map *chl_map,
int ch_count,
int byte_count)
{
@@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
head->arg.head[0] = rqstp->rq_arg.head[0];
head->arg.tail[0] = rqstp->rq_arg.tail[0];
head->arg.pages = &head->pages[head->count];
- head->sge[0].length = head->count; /* save count of hdr pages */
+ head->hdr_count = head->count; /* save count of hdr pages */
head->arg.page_base = 0;
head->arg.page_len = ch_bytes;
head->arg.len = rqstp->rq_arg.len + ch_bytes;
head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
head->count++;
- ch_sge_ary[0].start = 0;
+ chl_map->ch[0].start = 0;
while (byte_count) {
+ rpl_map->sge[sge_no].iov_base =
+ page_address(rqstp->rq_arg.pages[page_no]) + page_off;
sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
- sge[sge_no].addr =
- ib_dma_map_page(xprt->sc_cm_id->device,
- rqstp->rq_arg.pages[page_no],
- page_off, sge_bytes,
- DMA_FROM_DEVICE);
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ rpl_map->sge[sge_no].iov_len = sge_bytes;
/*
* Don't bump head->count here because the same page
* may be used by multiple SGE.
@@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
* SGE, move to the next SGE
*/
if (ch_bytes == 0) {
- ch_sge_ary[ch_no].count =
- sge_no - ch_sge_ary[ch_no].start;
+ chl_map->ch[ch_no].count =
+ sge_no - chl_map->ch[ch_no].start;
ch_no++;
ch++;
- ch_sge_ary[ch_no].start = sge_no;
+ chl_map->ch[ch_no].start = sge_no;
ch_bytes = ch->rc_target.rs_length;
/* If bytes remaining account for next chunk */
if (byte_count) {
@@ -220,18 +211,25 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
return sge_no;
}
-static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
- struct ib_sge *sge,
+static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct kvec *vec,
u64 *sgl_offset,
int count)
{
int i;
ctxt->count = count;
+ ctxt->direction = DMA_FROM_DEVICE;
for (i = 0; i < count; i++) {
- ctxt->sge[i].addr = sge[i].addr;
- ctxt->sge[i].length = sge[i].length;
- *sgl_offset = *sgl_offset + sge[i].length;
+ atomic_inc(&xprt->sc_dma_used);
+ ctxt->sge[i].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ vec[i].iov_base, vec[i].iov_len,
+ DMA_FROM_DEVICE);
+ ctxt->sge[i].length = vec[i].iov_len;
+ ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
+ *sgl_offset = *sgl_offset + vec[i].iov_len;
}
}
@@ -260,11 +258,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
* On our side, we need to read into a pagelist. The first page immediately
* follows the RPC header.
*
- * This function returns 1 to indicate success. The data is not yet in
+ * This function returns:
+ * 0 - No error and no read-list found.
+ *
+ * 1 - Successful read-list processing. The data is not yet in
* the pagelist and therefore the RPC request must be deferred. The
* I/O completion will enqueue the transport again and
* svc_rdma_recvfrom will complete the request.
*
+ * <0 - Error processing/posting read-list.
+ *
* NOTE: The ctxt must not be touched after the last WR has been posted
* because the I/O completion processing may occur on another
* processor and free / modify the context. Ne touche pas!
@@ -277,50 +280,38 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
struct ib_send_wr read_wr;
int err = 0;
int ch_no;
- struct ib_sge *sge;
int ch_count;
int byte_count;
int sge_count;
u64 sgl_offset;
struct rpcrdma_read_chunk *ch;
struct svc_rdma_op_ctxt *ctxt = NULL;
- struct svc_rdma_op_ctxt *head;
- struct svc_rdma_op_ctxt *tmp_sge_ctxt;
- struct svc_rdma_op_ctxt *tmp_ch_ctxt;
- struct chunk_sge *ch_sge_ary;
+ struct svc_rdma_req_map *rpl_map;
+ struct svc_rdma_req_map *chl_map;
/* If no read list is present, return 0 */
ch = svc_rdma_get_read_chunk(rmsgp);
if (!ch)
return 0;
- /* Allocate temporary contexts to keep SGE */
- BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
- tmp_sge_ctxt = svc_rdma_get_context(xprt);
- sge = tmp_sge_ctxt->sge;
- tmp_ch_ctxt = svc_rdma_get_context(xprt);
- ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
+ /* Allocate temporary reply and chunk maps */
+ rpl_map = svc_rdma_get_req_map();
+ chl_map = svc_rdma_get_req_map();
svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
+ if (ch_count > RPCSVC_MAXPAGES)
+ return -EINVAL;
sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
- sge, ch_sge_ary,
+ rpl_map, chl_map,
ch_count, byte_count);
- head = svc_rdma_get_context(xprt);
sgl_offset = 0;
ch_no = 0;
for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
ch->rc_discrim != 0; ch++, ch_no++) {
next_sge:
- if (!ctxt)
- ctxt = head;
- else {
- ctxt->next = svc_rdma_get_context(xprt);
- ctxt = ctxt->next;
- }
- ctxt->next = NULL;
+ ctxt = svc_rdma_get_context(xprt);
ctxt->direction = DMA_FROM_DEVICE;
- clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
/* Prepare READ WR */
@@ -333,50 +324,46 @@ next_sge:
read_wr.wr.rdma.remote_addr =
get_unaligned(&(ch->rc_target.rs_offset)) +
sgl_offset;
- read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
+ read_wr.sg_list = ctxt->sge;
read_wr.num_sge =
- rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
- rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
+ rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+ rdma_set_ctxt_sge(xprt, ctxt,
+ &rpl_map->sge[chl_map->ch[ch_no].start],
&sgl_offset,
read_wr.num_sge);
if (((ch+1)->rc_discrim == 0) &&
- (read_wr.num_sge == ch_sge_ary[ch_no].count)) {
+ (read_wr.num_sge == chl_map->ch[ch_no].count)) {
/*
* Mark the last RDMA_READ with a bit to
* indicate all RPC data has been fetched from
* the client and the RPC needs to be enqueued.
*/
set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
- ctxt->next = hdr_ctxt;
- hdr_ctxt->next = head;
+ ctxt->read_hdr = hdr_ctxt;
}
/* Post the read */
err = svc_rdma_send(xprt, &read_wr);
if (err) {
- printk(KERN_ERR "svcrdma: Error posting send = %d\n",
+ printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
err);
- /*
- * Break the circular list so free knows when
- * to stop if the error happened to occur on
- * the last read
- */
- ctxt->next = NULL;
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ svc_rdma_put_context(ctxt, 0);
goto out;
}
atomic_inc(&rdma_stat_read);
- if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
- ch_sge_ary[ch_no].count -= read_wr.num_sge;
- ch_sge_ary[ch_no].start += read_wr.num_sge;
+ if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+ chl_map->ch[ch_no].count -= read_wr.num_sge;
+ chl_map->ch[ch_no].start += read_wr.num_sge;
goto next_sge;
}
sgl_offset = 0;
- err = 0;
+ err = 1;
}
out:
- svc_rdma_put_context(tmp_sge_ctxt, 0);
- svc_rdma_put_context(tmp_ch_ctxt, 0);
+ svc_rdma_put_req_map(rpl_map);
+ svc_rdma_put_req_map(chl_map);
/* Detach arg pages. svc_recv will replenish them */
for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
@@ -389,25 +376,12 @@ next_sge:
while (rqstp->rq_resused)
rqstp->rq_respages[--rqstp->rq_resused] = NULL;
- if (err) {
- printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- /* Free the linked list of read contexts */
- while (head != NULL) {
- ctxt = head->next;
- svc_rdma_put_context(head, 1);
- head = ctxt;
- }
- return 0;
- }
-
- return 1;
+ return err;
}
static int rdma_read_complete(struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *data)
+ struct svc_rdma_op_ctxt *head)
{
- struct svc_rdma_op_ctxt *head = data->next;
int page_no;
int ret;
@@ -419,7 +393,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_pages[page_no] = head->pages[page_no];
}
/* Point rq_arg.pages past header */
- rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
+ rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
rqstp->rq_arg.page_len = head->arg.page_len;
rqstp->rq_arg.page_base = head->arg.page_base;
@@ -433,21 +407,12 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.len = head->arg.len;
rqstp->rq_arg.buflen = head->arg.buflen;
+ /* Free the context */
+ svc_rdma_put_context(head, 0);
+
/* XXX: What should this be? */
rqstp->rq_prot = IPPROTO_MAX;
-
- /*
- * Free the contexts we used to build the RDMA_READ. We have
- * to be careful here because the context list uses the same
- * next pointer used to chain the contexts associated with the
- * RDMA_READ
- */
- data->next = NULL; /* terminate circular list */
- do {
- data = head->next;
- svc_rdma_put_context(head, 0);
- head = data;
- } while (head != NULL);
+ svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
ret = rqstp->rq_arg.head[0].iov_len
+ rqstp->rq_arg.page_len
@@ -457,8 +422,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
rqstp->rq_arg.head[0].iov_len);
- /* Indicate that we've consumed an RQ credit */
- rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
svc_xprt_received(rqstp->rq_xprt);
return ret;
}
@@ -480,13 +443,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dprintk("svcrdma: rqstp=%p\n", rqstp);
- /*
- * The rq_xprt_ctxt indicates if we've consumed an RQ credit
- * or not. It is used in the rdma xpo_release_rqst function to
- * determine whether or not to return an RQ WQE to the RQ.
- */
- rqstp->rq_xprt_ctxt = NULL;
-
spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
@@ -537,21 +493,22 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
/* If the request is invalid, reply with an error */
if (len < 0) {
if (len == -ENOSYS)
- (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+ svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
goto close_out;
}
- /* Read read-list data. If we would need to wait, defer
- * it. Not that in this case, we don't return the RQ credit
- * until after the read completes.
- */
- if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
+ /* Read read-list data. */
+ ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
+ if (ret > 0) {
+ /* read-list posted, defer until data received from client. */
svc_xprt_received(xprt);
return 0;
}
-
- /* Indicate we've consumed an RQ credit */
- rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+ if (ret < 0) {
+ /* Post of read-list failed, free context. */
+ svc_rdma_put_context(ctxt, 1);
+ return 0;
+ }
ret = rqstp->rq_arg.head[0].iov_len
+ rqstp->rq_arg.page_len
@@ -569,11 +526,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
return ret;
close_out:
- if (ctxt) {
+ if (ctxt)
svc_rdma_put_context(ctxt, 1);
- /* Indicate we've consumed an RQ credit */
- rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
- }
dprintk("svcrdma: transport %p is closing\n", xprt);
/*
* Set the close bit and enqueue it. svc_recv will see the
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 981f190..a19b22b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -63,52 +63,44 @@
* SGE[2..sge_count-2] data from xdr->pages[]
* SGE[sge_count-1] data from xdr->tail.
*
+ * The max SGE we need is the length of the XDR / pagesize + one for
+ * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+ * reserves a page for both the request and the reply header, and this
+ * array is only concerned with the reply we are assured that we have
+ * on extra page for the RPCRMDA header.
*/
-static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct ib_sge *sge,
- int *sge_count)
+static void xdr_to_sge(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
{
- /* Max we need is the length of the XDR / pagesize + one for
- * head + one for tail + one for RPCRDMA header
- */
int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
int sge_no;
- u32 byte_count = xdr->len;
u32 sge_bytes;
u32 page_bytes;
- int page_off;
+ u32 page_off;
int page_no;
+ BUG_ON(xdr->len !=
+ (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
/* Skip the first sge, this is for the RPCRDMA header */
sge_no = 1;
/* Head SGE */
- sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
- xdr->head[0].iov_base,
- xdr->head[0].iov_len,
- DMA_TO_DEVICE);
- sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
- byte_count -= sge_bytes;
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
sge_no++;
/* pages SGE */
page_no = 0;
page_bytes = xdr->page_len;
page_off = xdr->page_base;
- while (byte_count && page_bytes) {
- sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
- sge[sge_no].addr =
- ib_dma_map_page(xprt->sc_cm_id->device,
- xdr->pages[page_no], page_off,
- sge_bytes, DMA_TO_DEVICE);
- sge_bytes = min(sge_bytes, page_bytes);
- byte_count -= sge_bytes;
+ while (page_bytes) {
+ vec->sge[sge_no].iov_base =
+ page_address(xdr->pages[page_no]) + page_off;
+ sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
page_bytes -= sge_bytes;
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ vec->sge[sge_no].iov_len = sge_bytes;
sge_no++;
page_no++;
@@ -116,36 +108,24 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
}
/* Tail SGE */
- if (byte_count && xdr->tail[0].iov_len) {
- sge[sge_no].addr =
- ib_dma_map_single(xprt->sc_cm_id->device,
- xdr->tail[0].iov_base,
- xdr->tail[0].iov_len,
- DMA_TO_DEVICE);
- sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
- byte_count -= sge_bytes;
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ if (xdr->tail[0].iov_len) {
+ vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
sge_no++;
}
BUG_ON(sge_no > sge_max);
- BUG_ON(byte_count != 0);
-
- *sge_count = sge_no;
- return sge;
+ vec->count = sge_no;
}
-
/* Assumptions:
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
u32 rmr, u64 to,
u32 xdr_off, int write_len,
- struct ib_sge *xdr_sge, int sge_count)
+ struct svc_rdma_req_map *vec)
{
- struct svc_rdma_op_ctxt *tmp_sge_ctxt;
struct ib_send_wr write_wr;
struct ib_sge *sge;
int xdr_sge_no;
@@ -154,25 +134,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
int sge_off;
int bc;
struct svc_rdma_op_ctxt *ctxt;
- int ret = 0;
- BUG_ON(sge_count > RPCSVC_MAXPAGES);
+ BUG_ON(vec->count > RPCSVC_MAXPAGES);
dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
- "write_len=%d, xdr_sge=%p, sge_count=%d\n",
+ "write_len=%d, vec->sge=%p, vec->count=%lu\n",
rmr, (unsigned long long)to, xdr_off,
- write_len, xdr_sge, sge_count);
+ write_len, vec->sge, vec->count);
ctxt = svc_rdma_get_context(xprt);
- ctxt->count = 0;
- tmp_sge_ctxt = svc_rdma_get_context(xprt);
- sge = tmp_sge_ctxt->sge;
+ ctxt->direction = DMA_TO_DEVICE;
+ sge = ctxt->sge;
/* Find the SGE associated with xdr_off */
- for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
+ for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
xdr_sge_no++) {
- if (xdr_sge[xdr_sge_no].length > bc)
+ if (vec->sge[xdr_sge_no].iov_len > bc)
break;
- bc -= xdr_sge[xdr_sge_no].length;
+ bc -= vec->sge[xdr_sge_no].iov_len;
}
sge_off = bc;
@@ -180,21 +158,28 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge_no = 0;
/* Copy the remaining SGE */
- while (bc != 0 && xdr_sge_no < sge_count) {
- sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
- sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
+ while (bc != 0 && xdr_sge_no < vec->count) {
+ sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
sge_bytes = min((size_t)bc,
- (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
+ (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
sge[sge_no].length = sge_bytes;
-
+ atomic_inc(&xprt->sc_dma_used);
+ sge[sge_no].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ (void *)
+ vec->sge[xdr_sge_no].iov_base + sge_off,
+ sge_bytes, DMA_TO_DEVICE);
+ if (dma_mapping_error(sge[sge_no].addr))
+ goto err;
sge_off = 0;
sge_no++;
+ ctxt->count++;
xdr_sge_no++;
bc -= sge_bytes;
}
BUG_ON(bc != 0);
- BUG_ON(xdr_sge_no > sge_count);
+ BUG_ON(xdr_sge_no > vec->count);
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
@@ -209,21 +194,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
/* Post It */
atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr)) {
- svc_rdma_put_context(ctxt, 1);
- /* Fatal error, close transport */
- ret = -EIO;
- }
- svc_rdma_put_context(tmp_sge_ctxt, 0);
- return ret;
+ if (svc_rdma_send(xprt, &write_wr))
+ goto err;
+ return 0;
+ err:
+ svc_rdma_put_context(ctxt, 0);
+ /* Fatal error, close transport */
+ return -EIO;
}
static int send_write_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rdma_argp,
struct rpcrdma_msg *rdma_resp,
struct svc_rqst *rqstp,
- struct ib_sge *sge,
- int sge_count)
+ struct svc_rdma_req_map *vec)
{
u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
int write_len;
@@ -269,8 +253,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
rs_offset + chunk_off,
xdr_off,
this_write,
- sge,
- sge_count);
+ vec);
if (ret) {
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
ret);
@@ -292,8 +275,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rdma_argp,
struct rpcrdma_msg *rdma_resp,
struct svc_rqst *rqstp,
- struct ib_sge *sge,
- int sge_count)
+ struct svc_rdma_req_map *vec)
{
u32 xfer_len = rqstp->rq_res.len;
int write_len;
@@ -341,8 +323,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
rs_offset + chunk_off,
xdr_off,
this_write,
- sge,
- sge_count);
+ vec);
if (ret) {
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
ret);
@@ -380,7 +361,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct page *page,
struct rpcrdma_msg *rdma_resp,
struct svc_rdma_op_ctxt *ctxt,
- int sge_count,
+ struct svc_rdma_req_map *vec,
int byte_count)
{
struct ib_send_wr send_wr;
@@ -389,11 +370,23 @@ static int send_reply(struct svcxprt_rdma *rdma,
int page_no;
int ret;
+ /* Post a recv buffer to handle another request. */
+ ret = svc_rdma_post_recv(rdma);
+ if (ret) {
+ printk(KERN_INFO
+ "svcrdma: could not post a receive buffer, err=%d."
+ "Closing transport %p.\n", ret, rdma);
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ svc_rdma_put_context(ctxt, 0);
+ return -ENOTCONN;
+ }
+
/* Prepare the context */
ctxt->pages[0] = page;
ctxt->count = 1;
/* Prepare the SGE for the RPCRDMA Header */
+ atomic_inc(&rdma->sc_dma_used);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device,
page, 0, PAGE_SIZE, DMA_TO_DEVICE);
@@ -402,10 +395,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
/* Determine how many of our SGE are to be transmitted */
- for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
- sge_bytes = min((size_t)ctxt->sge[sge_no].length,
- (size_t)byte_count);
+ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+ sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
+ atomic_inc(&rdma->sc_dma_used);
+ ctxt->sge[sge_no].addr =
+ ib_dma_map_single(rdma->sc_cm_id->device,
+ vec->sge[sge_no].iov_base,
+ sge_bytes, DMA_TO_DEVICE);
+ ctxt->sge[sge_no].length = sge_bytes;
+ ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
}
BUG_ON(byte_count != 0);
@@ -417,8 +416,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
+ /* If there are more pages than SGE, terminate SGE list */
+ if (page_no+1 >= sge_no)
+ ctxt->sge[page_no+1].length = 0;
}
-
BUG_ON(sge_no > rdma->sc_max_sge);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
@@ -462,20 +463,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
enum rpcrdma_proc reply_type;
int ret;
int inline_bytes;
- struct ib_sge *sge;
- int sge_count = 0;
struct page *res_page;
struct svc_rdma_op_ctxt *ctxt;
+ struct svc_rdma_req_map *vec;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
/* Get the RDMA request header. */
rdma_argp = xdr_start(&rqstp->rq_arg);
- /* Build an SGE for the XDR */
+ /* Build an req vec for the XDR */
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
- sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
+ vec = svc_rdma_get_req_map();
+ xdr_to_sge(rdma, &rqstp->rq_res, vec);
inline_bytes = rqstp->rq_res.len;
@@ -492,7 +493,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
/* Send any write-chunk data and build resp write-list */
ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
- rqstp, sge, sge_count);
+ rqstp, vec);
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
ret);
@@ -502,7 +503,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
/* Send any reply-list data and update resp reply-list */
ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
- rqstp, sge, sge_count);
+ rqstp, vec);
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
ret);
@@ -510,11 +511,13 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
}
inline_bytes -= ret;
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
inline_bytes);
+ svc_rdma_put_req_map(vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
error:
+ svc_rdma_put_req_map(vec);
svc_rdma_put_context(ctxt, 0);
put_page(res_page);
return ret;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index af408fc..19ddc38 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -84,67 +84,37 @@ struct svc_xprt_class svc_rdma_class = {
.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
};
-static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
+/* WR context cache. Created in svc_rdma.c */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
- int target;
- int at_least_one = 0;
struct svc_rdma_op_ctxt *ctxt;
- target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
- xprt->sc_ctxt_max);
-
- spin_lock_bh(&xprt->sc_ctxt_lock);
- while (xprt->sc_ctxt_cnt < target) {
- xprt->sc_ctxt_cnt++;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
-
- ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
-
- spin_lock_bh(&xprt->sc_ctxt_lock);
- if (ctxt) {
- at_least_one = 1;
- ctxt->next = xprt->sc_ctxt_head;
- xprt->sc_ctxt_head = ctxt;
- } else {
- /* kmalloc failed...give up for now */
- xprt->sc_ctxt_cnt--;
+ while (1) {
+ ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
+ if (ctxt)
break;
- }
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
}
- spin_unlock_bh(&xprt->sc_ctxt_lock);
- dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
- xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
- return at_least_one;
+ ctxt->xprt = xprt;
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt->count = 0;
+ atomic_inc(&xprt->sc_ctxt_used);
+ return ctxt;
}
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
- struct svc_rdma_op_ctxt *ctxt;
-
- while (1) {
- spin_lock_bh(&xprt->sc_ctxt_lock);
- if (unlikely(xprt->sc_ctxt_head == NULL)) {
- /* Try to bump my cache. */
- spin_unlock_bh(&xprt->sc_ctxt_lock);
-
- if (rdma_bump_context_cache(xprt))
- continue;
-
- printk(KERN_INFO "svcrdma: sleeping waiting for "
- "context memory on xprt=%p\n",
- xprt);
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- continue;
- }
- ctxt = xprt->sc_ctxt_head;
- xprt->sc_ctxt_head = ctxt->next;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
- ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->dto_q);
- ctxt->count = 0;
- break;
+ struct svcxprt_rdma *xprt = ctxt->xprt;
+ int i;
+ for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+ atomic_dec(&xprt->sc_dma_used);
+ ib_dma_unmap_single(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr,
+ ctxt->sge[i].length,
+ ctxt->direction);
}
- return ctxt;
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -158,15 +128,34 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- for (i = 0; i < ctxt->count; i++)
- dma_unmap_single(xprt->sc_cm_id->device->dma_device,
- ctxt->sge[i].addr,
- ctxt->sge[i].length,
- ctxt->direction);
- spin_lock_bh(&xprt->sc_ctxt_lock);
- ctxt->next = xprt->sc_ctxt_head;
- xprt->sc_ctxt_head = ctxt;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
+ atomic_dec(&xprt->sc_ctxt_used);
+}
+
+/* Temporary NFS request map cache. Created in svc_rdma.c */
+extern struct kmem_cache *svc_rdma_map_cachep;
+
+/*
+ * Temporary NFS req mappings are shared across all transport
+ * instances. These are short lived and should be bounded by the number
+ * of concurrent server threads * depth of the SQ.
+ */
+struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+{
+ struct svc_rdma_req_map *map;
+ while (1) {
+ map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
+ if (map)
+ break;
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+ }
+ map->count = 0;
+ return map;
+}
+
+void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+{
+ kmem_cache_free(svc_rdma_map_cachep, map);
}
/* ib_cq event handler */
@@ -228,23 +217,8 @@ static void dto_tasklet_func(unsigned long data)
list_del_init(&xprt->sc_dto_q);
spin_unlock_irqrestore(&dto_lock, flags);
- if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
- ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
- rq_cq_reap(xprt);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- /*
- * If data arrived before established event,
- * don't enqueue. This defers RPC I/O until the
- * RDMA connection is complete.
- */
- if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
- svc_xprt_enqueue(&xprt->sc_xprt);
- }
-
- if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
- ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
- sq_cq_reap(xprt);
- }
+ rq_cq_reap(xprt);
+ sq_cq_reap(xprt);
svc_xprt_put(&xprt->sc_xprt);
spin_lock_irqsave(&dto_lock, flags);
@@ -263,11 +237,15 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
struct svcxprt_rdma *xprt = cq_context;
unsigned long flags;
+ /* Guard against unconditional flush call for destroyed QP */
+ if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+ return;
+
/*
* Set the bit regardless of whether or not it's on the list
* because it may be on the list already due to an SQ
* completion.
- */
+ */
set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
/*
@@ -290,6 +268,8 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
*
* Take all completing WC off the CQE and enqueue the associated DTO
* context on the dto_q for the transport.
+ *
+ * Note that caller must hold a transport reference.
*/
static void rq_cq_reap(struct svcxprt_rdma *xprt)
{
@@ -297,29 +277,48 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
struct ib_wc wc;
struct svc_rdma_op_ctxt *ctxt = NULL;
+ if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
+ return;
+
+ ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
atomic_inc(&rdma_stat_rq_poll);
- spin_lock_bh(&xprt->sc_rq_dto_lock);
while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
ctxt->wc_status = wc.status;
ctxt->byte_len = wc.byte_len;
+ svc_rdma_unmap_dma(ctxt);
if (wc.status != IB_WC_SUCCESS) {
/* Close the transport */
+ dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
svc_rdma_put_context(ctxt, 1);
+ svc_xprt_put(&xprt->sc_xprt);
continue;
}
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ svc_xprt_put(&xprt->sc_xprt);
}
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
if (ctxt)
atomic_inc(&rdma_stat_rq_prod);
+
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ /*
+ * If data arrived before established event,
+ * don't enqueue. This defers RPC I/O until the
+ * RDMA connection is complete.
+ */
+ if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+ svc_xprt_enqueue(&xprt->sc_xprt);
}
/*
* Send Queue Completion Handler - potentially called on interrupt context.
+ *
+ * Note that caller must hold a transport reference.
*/
static void sq_cq_reap(struct svcxprt_rdma *xprt)
{
@@ -328,11 +327,17 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
struct ib_cq *cq = xprt->sc_sq_cq;
int ret;
+
+ if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+ return;
+
+ ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
atomic_inc(&rdma_stat_sq_poll);
while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
xprt = ctxt->xprt;
+ svc_rdma_unmap_dma(ctxt);
if (wc.status != IB_WC_SUCCESS)
/* Close the transport */
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -343,20 +348,25 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
switch (ctxt->wr_op) {
case IB_WR_SEND:
- case IB_WR_RDMA_WRITE:
svc_rdma_put_context(ctxt, 1);
break;
+ case IB_WR_RDMA_WRITE:
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
case IB_WR_RDMA_READ:
if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+ BUG_ON(!read_hdr);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
spin_lock_bh(&xprt->sc_read_complete_lock);
- list_add_tail(&ctxt->dto_q,
+ list_add_tail(&read_hdr->dto_q,
&xprt->sc_read_complete_q);
spin_unlock_bh(&xprt->sc_read_complete_lock);
svc_xprt_enqueue(&xprt->sc_xprt);
}
+ svc_rdma_put_context(ctxt, 0);
break;
default:
@@ -365,6 +375,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
wc.opcode, wc.status);
break;
}
+ svc_xprt_put(&xprt->sc_xprt);
}
if (ctxt)
@@ -376,11 +387,15 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
struct svcxprt_rdma *xprt = cq_context;
unsigned long flags;
+ /* Guard against unconditional flush call for destroyed QP */
+ if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+ return;
+
/*
* Set the bit regardless of whether or not it's on the list
* because it may be on the list already due to an RQ
* completion.
- */
+ */
set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
/*
@@ -398,39 +413,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
tasklet_schedule(&dto_tasklet);
}
-static void create_context_cache(struct svcxprt_rdma *xprt,
- int ctxt_count, int ctxt_bump, int ctxt_max)
-{
- struct svc_rdma_op_ctxt *ctxt;
- int i;
-
- xprt->sc_ctxt_max = ctxt_max;
- xprt->sc_ctxt_bump = ctxt_bump;
- xprt->sc_ctxt_cnt = 0;
- xprt->sc_ctxt_head = NULL;
- for (i = 0; i < ctxt_count; i++) {
- ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
- if (ctxt) {
- ctxt->next = xprt->sc_ctxt_head;
- xprt->sc_ctxt_head = ctxt;
- xprt->sc_ctxt_cnt++;
- }
- }
-}
-
-static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
-{
- struct svc_rdma_op_ctxt *next;
- if (!ctxt)
- return;
-
- do {
- next = ctxt->next;
- kfree(ctxt);
- ctxt = next;
- } while (next);
-}
-
static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
int listener)
{
@@ -447,7 +429,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_read_complete_lock);
- spin_lock_init(&cma_xprt->sc_ctxt_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
cma_xprt->sc_ord = svcrdma_ord;
@@ -456,21 +437,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
cma_xprt->sc_max_requests = svcrdma_max_requests;
cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
atomic_set(&cma_xprt->sc_sq_count, 0);
+ atomic_set(&cma_xprt->sc_ctxt_used, 0);
- if (!listener) {
- int reqs = cma_xprt->sc_max_requests;
- create_context_cache(cma_xprt,
- reqs << 1, /* starting size */
- reqs, /* bump amount */
- reqs +
- cma_xprt->sc_sq_depth +
- RPCRDMA_MAX_THREADS + 1); /* max */
- if (!cma_xprt->sc_ctxt_head) {
- kfree(cma_xprt);
- return NULL;
- }
- clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
- } else
+ if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
return cma_xprt;
@@ -506,6 +475,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
BUG_ON(sge_no >= xprt->sc_max_sge);
page = svc_rdma_get_page();
ctxt->pages[sge_no] = page;
+ atomic_inc(&xprt->sc_dma_used);
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
DMA_FROM_DEVICE);
@@ -520,7 +490,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
recv_wr.num_sge = ctxt->count;
recv_wr.wr_id = (u64)(unsigned long)ctxt;
+ svc_xprt_get(&xprt->sc_xprt);
ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
+ if (ret) {
+ svc_xprt_put(&xprt->sc_xprt);
+ svc_rdma_put_context(ctxt, 1);
+ }
return ret;
}
@@ -535,10 +510,11 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
* will call the recvfrom method on the listen xprt which will accept the new
* connection.
*/
-static void handle_connect_req(struct rdma_cm_id *new_cma_id)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
{
struct svcxprt_rdma *listen_xprt = new_cma_id->context;
struct svcxprt_rdma *newxprt;
+ struct sockaddr *sa;
/* Create a new transport */
newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
@@ -551,6 +527,15 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
newxprt, newxprt->sc_cm_id, listen_xprt);
+ /* Save client advertised inbound read limit for use later in accept. */
+ newxprt->sc_ord = client_ird;
+
+ /* Set the local and remote addresses in the transport */
+ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+
/*
* Enqueue the new transport on the accept queue of the listening
* transport
@@ -581,7 +566,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
case RDMA_CM_EVENT_CONNECT_REQUEST:
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
"event=%d\n", cma_id, cma_id->context, event->event);
- handle_connect_req(cma_id);
+ handle_connect_req(cma_id,
+ event->param.conn.responder_resources);
break;
case RDMA_CM_EVENT_ESTABLISHED:
@@ -627,6 +613,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
if (xprt) {
set_bit(XPT_CLOSE, &xprt->xpt_flags);
svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
}
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -661,31 +648,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
cma_xprt = rdma_create_xprt(serv, 1);
if (!cma_xprt)
- return ERR_PTR(ENOMEM);
+ return ERR_PTR(-ENOMEM);
xprt = &cma_xprt->sc_xprt;
listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
if (IS_ERR(listen_id)) {
- svc_xprt_put(&cma_xprt->sc_xprt);
- dprintk("svcrdma: rdma_create_id failed = %ld\n",
- PTR_ERR(listen_id));
- return (void *)listen_id;
+ ret = PTR_ERR(listen_id);
+ dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
+ goto err0;
}
+
ret = rdma_bind_addr(listen_id, sa);
if (ret) {
- rdma_destroy_id(listen_id);
- svc_xprt_put(&cma_xprt->sc_xprt);
dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
- return ERR_PTR(ret);
+ goto err1;
}
cma_xprt->sc_cm_id = listen_id;
ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
if (ret) {
- rdma_destroy_id(listen_id);
- svc_xprt_put(&cma_xprt->sc_xprt);
dprintk("svcrdma: rdma_listen failed = %d\n", ret);
- return ERR_PTR(ret);
+ goto err1;
}
/*
@@ -696,6 +679,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
return &cma_xprt->sc_xprt;
+
+ err1:
+ rdma_destroy_id(listen_id);
+ err0:
+ kfree(cma_xprt);
+ return ERR_PTR(ret);
}
/*
@@ -716,7 +705,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct ib_device_attr devattr;
- struct sockaddr *sa;
int ret;
int i;
@@ -753,8 +741,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
(size_t)svcrdma_max_requests);
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
- newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
- (size_t)svcrdma_ord);
+ /*
+ * Limit ORD based on client limit, local device limit, and
+ * configured svcrdma limit.
+ */
+ newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+ newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
if (IS_ERR(newxprt->sc_pd)) {
@@ -826,7 +818,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
}
- svc_xprt_get(&newxprt->sc_xprt);
newxprt->sc_qp = newxprt->sc_cm_id->qp;
/* Register all of physical memory */
@@ -850,6 +841,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/* Swap out the handler */
newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+ /*
+ * Arm the CQs for the SQ and RQ before accepting so we can't
+ * miss the first message
+ */
+ ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+ ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+
/* Accept Connection */
set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
memset(&conn_param, 0, sizeof conn_param);
@@ -886,58 +884,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_requests,
newxprt->sc_ord);
- /* Set the local and remote addresses in the transport */
- sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
- svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
- sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
- svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
-
- ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
- ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
return &newxprt->sc_xprt;
errout:
dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
/* Take a reference in case the DTO handler runs */
svc_xprt_get(&newxprt->sc_xprt);
- if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) {
+ if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
ib_destroy_qp(newxprt->sc_qp);
- svc_xprt_put(&newxprt->sc_xprt);
- }
rdma_destroy_id(newxprt->sc_cm_id);
/* This call to put will destroy the transport */
svc_xprt_put(&newxprt->sc_xprt);
return NULL;
}
-/*
- * Post an RQ WQE to the RQ when the rqst is being released. This
- * effectively returns an RQ credit to the client. The rq_xprt_ctxt
- * will be null if the request is deferred due to an RDMA_READ or the
- * transport had no data ready (EAGAIN). Note that an RPC deferred in
- * svc_process will still return the credit, this is because the data
- * is copied and no longer consume a WQE/WC.
- */
static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
{
- int err;
- struct svcxprt_rdma *rdma =
- container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
- if (rqstp->rq_xprt_ctxt) {
- BUG_ON(rqstp->rq_xprt_ctxt != rdma);
- err = svc_rdma_post_recv(rdma);
- if (err)
- dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
- err);
- }
- rqstp->rq_xprt_ctxt = NULL;
}
/*
- * When connected, an svc_xprt has at least three references:
- *
- * - A reference held by the QP. We still hold that here because this
- * code deletes the QP and puts the reference.
+ * When connected, an svc_xprt has at least two references:
*
* - A reference held by the cm_id between the ESTABLISHED and
* DISCONNECTED events. If the remote peer disconnected first, this
@@ -946,7 +912,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
* - A reference held by the svc_recv code that called this function
* as part of close processing.
*
- * At a minimum two references should still be held.
+ * At a minimum one references should still be held.
*/
static void svc_rdma_detach(struct svc_xprt *xprt)
{
@@ -956,23 +922,50 @@ static void svc_rdma_detach(struct svc_xprt *xprt)
/* Disconnect and flush posted WQE */
rdma_disconnect(rdma->sc_cm_id);
-
- /* Destroy the QP if present (not a listener) */
- if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) {
- ib_destroy_qp(rdma->sc_qp);
- svc_xprt_put(xprt);
- }
-
- /* Destroy the CM ID */
- rdma_destroy_id(rdma->sc_cm_id);
}
-static void svc_rdma_free(struct svc_xprt *xprt)
+static void __svc_rdma_free(struct work_struct *work)
{
- struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt;
+ struct svcxprt_rdma *rdma =
+ container_of(work, struct svcxprt_rdma, sc_work);
dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+
/* We should only be called from kref_put */
- BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0);
+ BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
+
+ /*
+ * Destroy queued, but not processed read completions. Note
+ * that this cleanup has to be done before destroying the
+ * cm_id because the device ptr is needed to unmap the dma in
+ * svc_rdma_put_context.
+ */
+ while (!list_empty(&rdma->sc_read_complete_q)) {
+ struct svc_rdma_op_ctxt *ctxt;
+ ctxt = list_entry(rdma->sc_read_complete_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ svc_rdma_put_context(ctxt, 1);
+ }
+
+ /* Destroy queued, but not processed recv completions */
+ while (!list_empty(&rdma->sc_rq_dto_q)) {
+ struct svc_rdma_op_ctxt *ctxt;
+ ctxt = list_entry(rdma->sc_rq_dto_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ svc_rdma_put_context(ctxt, 1);
+ }
+
+ /* Warn if we leaked a resource or under-referenced */
+ WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
+ WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+
+ /* Destroy the QP if present (not a listener) */
+ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+ ib_destroy_qp(rdma->sc_qp);
+
if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
ib_destroy_cq(rdma->sc_sq_cq);
@@ -985,10 +978,20 @@ static void svc_rdma_free(struct svc_xprt *xprt)
if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
ib_dealloc_pd(rdma->sc_pd);
- destroy_context_cache(rdma->sc_ctxt_head);
+ /* Destroy the CM ID */
+ rdma_destroy_id(rdma->sc_cm_id);
+
kfree(rdma);
}
+static void svc_rdma_free(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ INIT_WORK(&rdma->sc_work, __svc_rdma_free);
+ schedule_work(&rdma->sc_work);
+}
+
static int svc_rdma_has_wspace(struct svc_xprt *xprt)
{
struct svcxprt_rdma *rdma =
@@ -1018,7 +1021,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
int ret;
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
- return 0;
+ return -ENOTCONN;
BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
@@ -1029,7 +1032,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
spin_unlock_bh(&xprt->sc_lock);
atomic_inc(&rdma_stat_sq_starve);
- /* See if we can reap some SQ WR */
+
+ /* See if we can opportunistically reap SQ WR to make room */
sq_cq_reap(xprt);
/* Wait until SQ WR available if SQ still full */
@@ -1041,22 +1045,25 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
continue;
}
/* Bumped used SQ WR count and post */
+ svc_xprt_get(&xprt->sc_xprt);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
if (!ret)
atomic_inc(&xprt->sc_sq_count);
- else
+ else {
+ svc_xprt_put(&xprt->sc_xprt);
dprintk("svcrdma: failed to post SQ WR rc=%d, "
"sc_sq_count=%d, sc_sq_depth=%d\n",
ret, atomic_read(&xprt->sc_sq_count),
xprt->sc_sq_depth);
+ }
spin_unlock_bh(&xprt->sc_lock);
break;
}
return ret;
}
-int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err)
+void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+ enum rpcrdma_errcode err)
{
struct ib_send_wr err_wr;
struct ib_sge sge;
@@ -1073,6 +1080,7 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
/* Prepare SGE for local address */
+ atomic_inc(&xprt->sc_dma_used);
sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
sge.lkey = xprt->sc_phys_mr->lkey;
@@ -1094,9 +1102,8 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
/* Post It */
ret = svc_rdma_send(xprt, &err_wr);
if (ret) {
- dprintk("svcrdma: Error posting send = %d\n", ret);
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
svc_rdma_put_context(ctxt, 1);
}
-
- return ret;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e18cd36..657835f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -169,6 +169,11 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk)
return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
}
+static inline int unix_recvq_full(struct sock const *sk)
+{
+ return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
+}
+
static struct sock *unix_peer_get(struct sock *s)
{
struct sock *peer;
@@ -482,6 +487,8 @@ static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
+static unsigned int unix_datagram_poll(struct file *, struct socket *,
+ poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
@@ -527,7 +534,7 @@ static const struct proto_ops unix_dgram_ops = {
.socketpair = unix_socketpair,
.accept = sock_no_accept,
.getname = unix_getname,
- .poll = datagram_poll,
+ .poll = unix_datagram_poll,
.ioctl = unix_ioctl,
.listen = sock_no_listen,
.shutdown = unix_shutdown,
@@ -548,7 +555,7 @@ static const struct proto_ops unix_seqpacket_ops = {
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
- .poll = datagram_poll,
+ .poll = unix_datagram_poll,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
@@ -983,8 +990,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
sched = !sock_flag(other, SOCK_DEAD) &&
!(other->sk_shutdown & RCV_SHUTDOWN) &&
- (skb_queue_len(&other->sk_receive_queue) >
- other->sk_max_ack_backlog);
+ unix_recvq_full(other);
unix_state_unlock(other);
@@ -1058,8 +1064,7 @@ restart:
if (other->sk_state != TCP_LISTEN)
goto out_unlock;
- if (skb_queue_len(&other->sk_receive_queue) >
- other->sk_max_ack_backlog) {
+ if (unix_recvq_full(other)) {
err = -EAGAIN;
if (!timeo)
goto out_unlock;
@@ -1428,9 +1433,7 @@ restart:
goto out_unlock;
}
- if (unix_peer(other) != sk &&
- (skb_queue_len(&other->sk_receive_queue) >
- other->sk_max_ack_backlog)) {
+ if (unix_peer(other) != sk && unix_recvq_full(other)) {
if (!timeo) {
err = -EAGAIN;
goto out_unlock;
@@ -1991,6 +1994,64 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
return mask;
}
+static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk, *peer;
+ unsigned int mask;
+
+ poll_wait(file, sk->sk_sleep, wait);
+
+ peer = unix_peer_get(sk);
+ if (peer) {
+ if (peer != sk) {
+ /*
+ * Writability of a connected socket additionally
+ * depends on the state of the receive queue of the
+ * peer.
+ */
+ poll_wait(file, &unix_sk(peer)->peer_wait, wait);
+ } else {
+ sock_put(peer);
+ peer = NULL;
+ }
+ }
+
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ if (sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ /* connection hasn't started yet? */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return mask;
+ }
+
+ /* writable? */
+ if (unix_writable(sk) && !(peer && unix_recvq_full(peer)))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ if (peer)
+ sock_put(peer);
+
+ return mask;
+}
#ifdef CONFIG_PROC_FS
static struct sock *first_unix_socket(int *i)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2bdd4dd..fb75f26 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -187,7 +187,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
return genlmsg_end(msg, hdr);
nla_put_failure:
- return genlmsg_cancel(msg, hdr);
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
}
static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
@@ -273,7 +274,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 pid, u32 seq, int flags,
return genlmsg_end(msg, hdr);
nla_put_failure:
- return genlmsg_cancel(msg, hdr);
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
}
static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
@@ -928,7 +930,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
return genlmsg_end(msg, hdr);
nla_put_failure:
- return genlmsg_cancel(msg, hdr);
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
}
static int nl80211_dump_station(struct sk_buff *skb,
@@ -1267,7 +1270,8 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
return genlmsg_end(msg, hdr);
nla_put_failure:
- return genlmsg_cancel(msg, hdr);
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
}
static int nl80211_dump_mpath(struct sk_buff *skb,
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index ac765dd..23a2cc0 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -200,8 +200,8 @@ static struct xfrm_algo_desc aalg_list[] = {
}
},
{
- .name = "hmac(ripemd160)",
- .compat = "ripemd160",
+ .name = "hmac(rmd160)",
+ .compat = "rmd160",
.uinfo = {
.auth = {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a1b0fbe..b976d9e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -50,19 +50,8 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
switch (type) {
case XFRMA_ALG_AUTH:
- if (!algp->alg_key_len &&
- strcmp(algp->alg_name, "digest_null") != 0)
- return -EINVAL;
- break;
-
case XFRMA_ALG_CRYPT:
- if (!algp->alg_key_len &&
- strcmp(algp->alg_name, "cipher_null") != 0)
- return -EINVAL;
- break;
-
case XFRMA_ALG_COMP:
- /* Zero length keys are legal. */
break;
default: