From 70a851667dbd7aa3f7be9609d6580ca9861230b3 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 16 Sep 2010 19:37:30 +0200
Subject: netfilter: use NFPROTO_IPV4 instead of AF_INET

The field family of xt_target should be NFPROTO_IPV4, though
NFPROTO_IPV4 and AF_INET are the same.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_TPROXY.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index c61294d..21bb2af 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -76,7 +76,7 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
 
 static struct xt_target tproxy_tg_reg __read_mostly = {
 	.name		= "TPROXY",
-	.family		= AF_INET,
+	.family		= NFPROTO_IPV4,
 	.table		= "mangle",
 	.target		= tproxy_tg,
 	.targetsize	= sizeof(struct xt_tproxy_target_info),
-- 
cgit v1.1


From b23909695c33f53df5f1d16696b1aa5b874c1904 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 16 Sep 2010 19:55:03 +0200
Subject: netfilter: nf_conntrack: fix the hash random initializing race

nf_conntrack_alloc() isn't called with nf_conntrack_lock locked, so hash
random initializing code maybe executed more than once on different
CPUs.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df3eedb..4c0ad9b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
 				  u16 zone, unsigned int size, unsigned int rnd)
@@ -574,10 +573,18 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 {
 	struct nf_conn *ct;
 
-	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
-		get_random_bytes(&nf_conntrack_hash_rnd,
-				sizeof(nf_conntrack_hash_rnd));
-		nf_conntrack_hash_rnd_initted = 1;
+	if (unlikely(!nf_conntrack_hash_rnd)) {
+		unsigned int rand;
+
+		/*
+		 * Why not initialize nf_conntrack_rnd in a "init()" function ?
+		 * Because there isn't enough entropy when system initializing,
+		 * and we initialize it as late as possible.
+		 */
+		do {
+			get_random_bytes(&rand, sizeof(rand));
+		} while (!rand);
+		cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
 	}
 
 	/* We don't want any race condition at early drop stage */
-- 
cgit v1.1


From 3575792e005dc9994f15ae72c1c6f401d134177d Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 17 Sep 2010 14:18:16 +0200
Subject: ipvs: extend connection flags to 32 bits

- the sync protocol supports 16 bits only, so bits 0..15 should be
used only for flags that should go to backup server, bits 16 and
above should be allocated for flags not sent to backup.

- use IP_VS_CONN_F_DEST_MASK as mask of connection flags in
destination that can be changed by user space

- allow IP_VS_CONN_F_ONE_PACKET to be set in destination

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/ip_vs_conn.c | 16 ++++++++++------
 net/netfilter/ipvs/ip_vs_core.c | 11 ++++++-----
 net/netfilter/ipvs/ip_vs_ctl.c  |  5 +++--
 3 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b71c69a..9fe1da7 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -505,6 +505,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 static inline void
 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 {
+	unsigned int conn_flags;
+
 	/* if dest is NULL, then return directly */
 	if (!dest)
 		return;
@@ -512,16 +514,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 	/* Increase the refcnt counter of the dest */
 	atomic_inc(&dest->refcnt);
 
+	conn_flags = atomic_read(&dest->conn_flags);
+	if (cp->protocol != IPPROTO_UDP)
+		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
 	/* Bind with the destination and its corresponding transmitter */
-	if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-	    (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+	if (cp->flags & IP_VS_CONN_F_SYNC) {
 		/* if the connection is not template and is created
 		 * by sync, preserve the activity flag.
 		 */
-		cp->flags |= atomic_read(&dest->conn_flags) &
-			     (~IP_VS_CONN_F_INACTIVE);
-	else
-		cp->flags |= atomic_read(&dest->conn_flags);
+		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+	}
+	cp->flags |= conn_flags;
 	cp->dest = dest;
 
 	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0c043b6..319991d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -194,7 +194,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
 	__be16  dport;			/* destination port to forward */
-	__be16  flags;
+	unsigned int flags;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
 
@@ -382,7 +382,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
-	__be16 _ports[2], *pptr, flags;
+	__be16 _ports[2], *pptr;
+	unsigned int flags;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -473,9 +474,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
-		__u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
-				iph.protocol == IPPROTO_UDP)?
-				IP_VS_CONN_F_ONE_PACKET : 0;
+		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				      iph.protocol == IPPROTO_UDP)?
+				      IP_VS_CONN_F_ONE_PACKET : 0;
 		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
 		ip_vs_service_put(svc);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ca8ec8c..7bd41d2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -765,7 +765,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 
 	/* set the weight and the flags */
 	atomic_set(&dest->weight, udest->weight);
-	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+	conn_flags |= IP_VS_CONN_F_INACTIVE;
 
 	/* check if local node and update the flags */
 #ifdef CONFIG_IP_VS_IPV6
@@ -782,7 +783,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 		}
 
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
-	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
 	} else {
 		/*
-- 
cgit v1.1


From f4bc17cdd205ebaa3807c2aa973719bb5ce6a5b2 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 21 Sep 2010 17:35:41 +0200
Subject: ipvs: netfilter connection tracking changes

	Add more code to IPVS to work with Netfilter connection
tracking and fix some problems.

- Allow IPVS to be compiled without connection tracking as in
2.6.35 and before. This can avoid keeping conntracks for all
IPVS connections because this costs memory. ip_vs_ftp still
depends on connection tracking and NAT as implemented for 2.6.36.

- Add sysctl var "conntrack" to enable connection tracking for
all IPVS connections. For loaded IPVS directors it needs
tuning of nf_conntrack_max limit.

- Add IP_VS_CONN_F_NFCT connection flag to request the connection
to use connection tracking. This allows user space to provide this
flag, for example, in dest->conn_flags. This can be useful to
request connection tracking per real server instead of forcing it
for all connections with the "conntrack" sysctl. This flag is
set currently only by ip_vs_ftp and of course by "conntrack" sysctl.

- Add ip_vs_nfct.c file to hold all connection tracking code,
by this way main code should not depend of netfilter conntrack
support.

- Return back the ip_vs_post_routing handler as in 2.6.35 and use
skb->ipvs_property=1 to allow IPVS to work without connection
tracking

Connection tracking:

- most of the code is already in 2.6.36-rc

- alter conntrack reply tuple for LVS-NAT connections when first packet
from client is forwarded and conntrack state is NEW or RELATED.
Additionally, alter reply for RELATED connections from real server,
again for packet in original direction.

- add IP_VS_XMIT_TUNNEL to confirm conntrack (without altering
reply) for LVS-TUN early because we want to call nf_reset. It is
needed because we add IPIP header and the original conntrack
should be preserved, not destroyed. The transmitted IPIP packets
can reuse same conntrack, so we do not set skb->ipvs_property.

- try to destroy conntrack when the IPVS connection is destroyed.
It is not fatal if conntrack disappears before that, it depends
on the used timers.

Fix problems from long time:

- add skb->ip_summed = CHECKSUM_NONE for the LVS-TUN transmitters

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/Kconfig      |  13 +-
 net/netfilter/ipvs/Makefile     |   5 +-
 net/netfilter/ipvs/ip_vs_conn.c |  13 ++
 net/netfilter/ipvs/ip_vs_core.c |  46 ++++++-
 net/netfilter/ipvs/ip_vs_ctl.c  |  12 ++
 net/netfilter/ipvs/ip_vs_ftp.c  | 146 +-------------------
 net/netfilter/ipvs/ip_vs_nfct.c | 292 ++++++++++++++++++++++++++++++++++++++++
 net/netfilter/ipvs/ip_vs_xmit.c |  98 +++++++-------
 8 files changed, 430 insertions(+), 195 deletions(-)
 create mode 100644 net/netfilter/ipvs/ip_vs_nfct.c

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 46a77d5..af3c9f4 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
 #
 menuconfig IP_VS
 	tristate "IP virtual server support"
-	depends on NET && INET && NETFILTER && NF_CONNTRACK
+	depends on NET && INET && NETFILTER
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
 
 config	IP_VS_FTP
   	tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP && NF_NAT
+        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+	select IP_VS_NFCT
 	---help---
 	  FTP is a protocol that transfers IP address and/or port number in
 	  the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,12 @@ config	IP_VS_FTP
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_NFCT
+	bool "Netfilter connection tracking"
+	depends on NF_CONNTRACK
+	---help---
+	  The Netfilter connection tracking support allows the IPVS
+	  connection state to be exported to the Netfilter framework
+	  for filtering purposes.
+
 endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd..349fe88 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
 
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
 		ip_vs_est.o ip_vs_proto.o 				   \
-		$(ip_vs_proto-objs-y)
+		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
 
 
 # IPVS core
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9fe1da7..a970d96 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -721,6 +721,9 @@ static void ip_vs_conn_expire(unsigned long data)
 		if (cp->control)
 			ip_vs_control_del(cp);
 
+		if (cp->flags & IP_VS_CONN_F_NFCT)
+			ip_vs_conn_drop_conntrack(cp);
+
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
@@ -816,6 +819,16 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 	if (unlikely(pp && atomic_read(&pp->appcnt)))
 		ip_vs_bind_app(cp, pp);
 
+	/*
+	 * Allow conntrack to be preserved. By default, conntrack
+	 * is created and destroyed for every packet.
+	 * Sometimes keeping conntrack can be useful for
+	 * IP_VS_CONN_F_ONE_PACKET too.
+	 */
+
+	if (ip_vs_conntrack_enabled())
+		cp->flags |= IP_VS_CONN_F_NFCT;
+
 	/* Hash it in the ip_vs_conn_tab finally */
 	ip_vs_conn_hash(cp);
 
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 319991d..7fbc80d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -537,6 +537,23 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	return NF_DROP;
 }
 
+/*
+ * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
+ * chain and is used to avoid double NAT and confirmation when we do
+ * not want to keep the conntrack structure
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	if (!skb->ipvs_property)
+		return NF_ACCEPT;
+	/* The packet was sent from IPVS, exit this chain */
+	return NF_STOP;
+}
+
 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -695,7 +712,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
-	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		skb->ipvs_property = 1;
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
 	verdict = NF_ACCEPT;
 
 out:
@@ -928,17 +948,19 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 	ip_vs_out_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-	ip_vs_update_conntrack(skb, cp, 0);
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		skb->ipvs_property = 1;
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
 	ip_vs_conn_put(cp);
 
-	skb->ipvs_property = 1;
-
 	LeaveFunction(11);
 	return NF_ACCEPT;
 
 drop:
 	ip_vs_conn_put(cp);
 	kfree_skb(skb);
+	LeaveFunction(11);
 	return NF_STOLEN;
 }
 
@@ -1483,6 +1505,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
+	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	{
+		.hook		= ip_vs_post_routing,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum        = NF_INET_POST_ROUTING,
+		.priority       = NF_IP_PRI_NAT_SRC-1,
+	},
 #ifdef CONFIG_IP_VS_IPV6
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
@@ -1511,6 +1541,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
+	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	{
+		.hook		= ip_vs_post_routing,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum        = NF_INET_POST_ROUTING,
+		.priority       = NF_IP6_PRI_NAT_SRC-1,
+	},
 #endif
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7bd41d2..d2d842f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -88,6 +88,9 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
 int sysctl_ip_vs_expire_quiescent_template = 0;
 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
 int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
 
 
 #ifdef CONFIG_IP_VS_DEBUG
@@ -1580,6 +1583,15 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
+#ifdef CONFIG_IP_VS_NFCT
+	{
+		.procname	= "conntrack",
+		.data		= &sysctl_ip_vs_conntrack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "secure_tcp",
 		.data		= &sysctl_ip_vs_secure_tcp,
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7e9af5b..9cd375f 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,17 +20,6 @@
  *
  * Author:	Wouter Gadeyne
  *
- *
- * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
- * http://www.ssi.bg/~ja/nfct/:
- *
- * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
- *
- * Portions Copyright (C) 2001-2002
- * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
- *
- * Portions Copyright (C) 2003-2008
- * Julian Anastasov
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -58,16 +47,6 @@
 #define SERVER_STRING "227 Entering Passive Mode ("
 #define CLIENT_STRING "PORT "
 
-#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
-			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
-			(T)->dst.protonum
-
-#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
-			&((C)->vaddr.ip), ntohs((C)->vport), \
-			&((C)->daddr.ip), ntohs((C)->dport), \
-			(C)->protocol, (C)->state
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv;
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 {
+	/* We use connection tracking for the command connection */
+	cp->flags |= IP_VS_CONN_F_NFCT;
 	return 0;
 }
 
@@ -149,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 }
 
 /*
- * Called from init_conntrack() as expectfn handler.
- */
-static void
-ip_vs_expect_callback(struct nf_conn *ct,
-		      struct nf_conntrack_expect *exp)
-{
-	struct nf_conntrack_tuple *orig, new_reply;
-	struct ip_vs_conn *cp;
-
-	if (exp->tuple.src.l3num != PF_INET)
-		return;
-
-	/*
-	 * We assume that no NF locks are held before this callback.
-	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
-	 * expectations even if they use wildcard values, now we provide the
-	 * actual values from the newly created original conntrack direction.
-	 * The conntrack is confirmed when packet reaches IPVS hooks.
-	 */
-
-	/* RS->CLIENT */
-	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
-				&orig->src.u3, orig->src.u.tcp.port,
-				&orig->dst.u3, orig->dst.u.tcp.port);
-	if (cp) {
-		/* Change reply CLIENT->RS to CLIENT->VS */
-		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		new_reply.dst.u3 = cp->vaddr;
-		new_reply.dst.u.tcp.port = cp->vport;
-		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-			  ", inout cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		goto alter;
-	}
-
-	/* CLIENT->VS */
-	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
-			       &orig->src.u3, orig->src.u.tcp.port,
-			       &orig->dst.u3, orig->dst.u.tcp.port);
-	if (cp) {
-		/* Change reply VS->CLIENT to RS->CLIENT */
-		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		new_reply.src.u3 = cp->daddr;
-		new_reply.src.u.tcp.port = cp->dport;
-		IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
-		goto alter;
-	}
-
-	IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-		  " - unknown expect\n",
-		  __func__, ct, ct->status, ARG_TUPLE(orig));
-	return;
-
-alter:
-	/* Never alter conntrack for non-NAT conns */
-	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
-		nf_conntrack_alter_reply(ct, &new_reply);
-	ip_vs_conn_put(cp);
-	return;
-}
-
-/*
- * Create NF conntrack expectation with wildcard (optional) source port.
- * Then the default callback function will alter the reply and will confirm
- * the conntrack entry when the first packet comes.
- */
-static void
-ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
-		     struct ip_vs_conn *cp, u_int8_t proto,
-		     const __be16 *port, int from_rs)
-{
-	struct nf_conntrack_expect *exp;
-
-	BUG_ON(!ct || ct == &nf_conntrack_untracked);
-
-	exp = nf_ct_expect_alloc(ct);
-	if (!exp)
-		return;
-
-	if (from_rs)
-		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-				  nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
-				  proto, port, &cp->cport);
-	else
-		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-				  nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
-				  proto, port, &cp->vport);
-
-	exp->expectfn = ip_vs_expect_callback;
-
-	IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
-		  __func__, ct, ARG_TUPLE(&exp->tuple));
-	nf_ct_expect_related(exp);
-	nf_ct_expect_put(exp);
-}
-
-/*
  * Look at outgoing ftp packets to catch the response to a PASV command
  * from the server (inside-to-outside).
  * When we see one, we build a connection entry with the client address,
@@ -335,7 +202,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 					      &cp->caddr, 0,
 					      &cp->vaddr, port,
 					      &from, port,
-					      IP_VS_CONN_F_NO_CPORT,
+					      IP_VS_CONN_F_NO_CPORT |
+					      IP_VS_CONN_F_NFCT,
 					      cp->dest);
 			if (!n_cp)
 				return 0;
@@ -371,8 +239,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 						       start-data, end-start,
 						       buf, buf_len);
 			if (ret)
-				ip_vs_expect_related(skb, ct, n_cp,
-						     IPPROTO_TCP, NULL, 0);
+				ip_vs_nfct_expect_related(skb, ct, n_cp,
+							  IPPROTO_TCP, 0, 0);
 		}
 
 		/*
@@ -487,7 +355,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 				      &to, port,
 				      &cp->vaddr, htons(ntohs(cp->vport)-1),
 				      &cp->daddr, htons(ntohs(cp->dport)-1),
-				      0,
+				      IP_VS_CONN_F_NFCT,
 				      cp->dest);
 		if (!n_cp)
 			return 0;
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 0000000..c038458
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>		Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>	Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple new_tuple;
+
+	if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+	    nf_ct_is_dying(ct))
+		return;
+
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return;
+
+	/* Alter reply only in original direction */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return;
+
+	/*
+	 * The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	/*
+	 * This will also take care of UDP and other protocols.
+	 */
+	if (outin) {
+		new_tuple.src.u3 = cp->daddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.src.u.tcp.port = cp->dport;
+	} else {
+		new_tuple.dst.u3 = cp->vaddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.dst.u.tcp.port = cp->vport;
+	}
+	IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		  "ctinfo=%d, old reply=" FMT_TUPLE
+		  ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+		  __func__, ct, ct->status, ctinfo,
+		  ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+		  ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+	nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+	struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_tuple *orig, new_reply;
+	struct ip_vs_conn *cp;
+
+	if (exp->tuple.src.l3num != PF_INET)
+		return;
+
+	/*
+	 * We assume that no NF locks are held before this callback.
+	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+	 * expectations even if they use wildcard values, now we provide the
+	 * actual values from the newly created original conntrack direction.
+	 * The conntrack is confirmed when packet reaches IPVS hooks.
+	 */
+
+	/* RS->CLIENT */
+	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
+				&orig->src.u3, orig->src.u.tcp.port,
+				&orig->dst.u3, orig->dst.u.tcp.port);
+	if (cp) {
+		/* Change reply CLIENT->RS to CLIENT->VS */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.dst.u3 = cp->vaddr;
+		new_reply.dst.u.tcp.port = cp->vport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+			  ", inout cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	/* CLIENT->VS */
+	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
+			       &orig->src.u3, orig->src.u.tcp.port,
+			       &orig->dst.u3, orig->dst.u.tcp.port);
+	if (cp) {
+		/* Change reply VS->CLIENT to RS->CLIENT */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.src.u3 = cp->daddr;
+		new_reply.src.u.tcp.port = cp->dport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		  " - unknown expect\n",
+		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	return;
+
+alter:
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+		nf_conntrack_alter_reply(ct, &new_reply);
+	ip_vs_conn_put(cp);
+	return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+			       struct ip_vs_conn *cp, u_int8_t proto,
+			       const __be16 port, int from_rs)
+{
+	struct nf_conntrack_expect *exp;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return;
+
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			from_rs ? &cp->daddr : &cp->caddr,
+			from_rs ? &cp->caddr : &cp->vaddr,
+			proto, port ? &port : NULL,
+			from_rs ? &cp->cport : &cp->vport);
+
+	exp->expectfn = ip_vs_nfct_expect_callback;
+
+	IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+		__func__, ct, ARG_TUPLE(&exp->tuple));
+	nf_ct_expect_related(exp);
+	nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct nf_conntrack_tuple tuple;
+
+	if (!cp->cport)
+		return;
+
+	tuple = (struct nf_conntrack_tuple) {
+		.dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+	tuple.src.u3 = cp->caddr;
+	tuple.src.u.all = cp->cport;
+	tuple.src.l3num = cp->af;
+	tuple.dst.u3 = cp->vaddr;
+	tuple.dst.u.all = cp->vport;
+
+	IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+		" for conn " FMT_CONN "\n",
+		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+	h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+	if (h) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* Show what happens instead of calling nf_ct_kill() */
+		if (del_timer(&ct->timeout)) {
+			IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+				FMT_TUPLE "\n",
+				__func__, ct, ARG_TUPLE(&tuple));
+			if (ct->timeout.function)
+				ct->timeout.function(ct->timeout.data);
+		} else {
+			IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+				FMT_TUPLE "\n",
+				__func__, ct, ARG_TUPLE(&tuple));
+		}
+		nf_ct_put(ct);
+	} else {
+		IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+			__func__, ARG_TUPLE(&tuple));
+	}
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 49df6be..8817afa 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -28,7 +28,6 @@
 #include <net/ip6_route.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack.h>
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip_vs.h>
@@ -194,12 +193,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 	dst_release(old_dst);
 }
 
-#define IP_VS_XMIT(pf, skb, rt)				\
+#define IP_VS_XMIT_TUNNEL(skb, cp)				\
+({								\
+	int __ret = NF_ACCEPT;					\
+								\
+	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\
+		__ret = ip_vs_confirm_conntrack(skb, cp);	\
+	if (__ret == NF_ACCEPT) {				\
+		nf_reset(skb);					\
+		(skb)->ip_summed = CHECKSUM_NONE;		\
+	}							\
+	__ret;							\
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp)				\
 do {							\
-	(skb)->ipvs_property = 1;			\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
+		(skb)->ipvs_property = 1;		\
+	else						\
+		ip_vs_update_conntrack(skb, cp, 1);	\
 	skb_forward_csum(skb);				\
 	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
-		(rt)->dst.dev, dst_output);		\
+		skb_dst(skb)->dev, dst_output);		\
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp)				\
+do {							\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
+		(skb)->ipvs_property = 1;		\
+	skb_forward_csum(skb);				\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
+		skb_dst(skb)->dev, dst_output);		\
 } while (0)
 
 
@@ -271,7 +295,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -335,7 +359,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -349,36 +373,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 }
 #endif
 
-void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
-{
-	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
-	struct nf_conntrack_tuple new_tuple;
-
-	if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
-		return;
-
-	/*
-	 * The connection is not yet in the hashtable, so we update it.
-	 * CIP->VIP will remain the same, so leave the tuple in
-	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
-	 * real-server we will see RIP->DIP.
-	 */
-	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-	if (outin)
-		new_tuple.src.u3 = cp->daddr;
-	else
-		new_tuple.dst.u3 = cp->vaddr;
-	/*
-	 * This will also take care of UDP and other protocols.
-	 */
-	if (outin)
-		new_tuple.src.u.tcp.port = cp->dport;
-	else
-		new_tuple.dst.u.tcp.port = cp->vport;
-	nf_conntrack_alter_reply(ct, &new_tuple);
-}
-
 /*
  *      NAT transmitter (only for outside-to-inside nat forwarding)
  *      Not used for related ICMP
@@ -434,8 +428,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
-	ip_vs_update_conntrack(skb, cp, 1);
-
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
 	   MTU problem. */
@@ -443,7 +435,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -451,8 +443,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
   tx_error_icmp:
 	dst_link_failure(skb);
   tx_error:
-	LeaveFunction(10);
 	kfree_skb(skb);
+	LeaveFunction(10);
 	return NF_STOLEN;
   tx_error_put:
 	ip_rt_put(rt);
@@ -512,8 +504,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
-	ip_vs_update_conntrack(skb, cp, 1);
-
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
 	   MTU problem. */
@@ -521,7 +511,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -571,6 +561,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
+	int ret;
 
 	EnterFunction(10);
 
@@ -655,7 +646,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ip_local_out(skb);
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
 
 	LeaveFunction(10);
 
@@ -681,6 +676,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
+	int ret;
 
 	EnterFunction(10);
 
@@ -761,7 +757,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ip6_local_out(skb);
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip6_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
 
 	LeaveFunction(10);
 
@@ -820,7 +820,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -873,7 +873,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -947,7 +947,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
 
 	rc = NF_STOLEN;
 	goto out;
@@ -1022,7 +1022,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
 
 	rc = NF_STOLEN;
 	goto out;
-- 
cgit v1.1


From 8a8030407f55a6aaedb51167c1a2383311fcd707 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 21 Sep 2010 17:38:57 +0200
Subject: ipvs: make rerouting optional with snat_reroute

	Add new sysctl flag "snat_reroute". Recent kernels use
ip_route_me_harder() to route LVS-NAT responses properly by
VIP when there are multiple paths to client. But setups
that do not have alternative default routes can skip this
routing lookup by using snat_reroute=0.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++--------
 net/netfilter/ipvs/ip_vs_ctl.c  |  8 ++++++++
 2 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7fbc80d..06c388b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -929,20 +929,31 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		ip_send_check(ip_hdr(skb));
 	}
 
+	/*
+	 * nf_iterate does not expect change in the skb->dst->dev.
+	 * It looks like it is not fatal to enable this code for hooks
+	 * where our handlers are at the end of the chain list and
+	 * when all next handlers use skb->dst->dev and not outdev.
+	 * It will definitely route properly the inout NAT traffic
+	 * when multiple paths are used.
+	 */
+
 	/* For policy routing, packets originating from this
 	 * machine itself may be routed differently to packets
 	 * passing through.  We want this packet to be routed as
 	 * if it came from this machine itself.  So re-compute
 	 * the routing information.
 	 */
+	if (sysctl_ip_vs_snat_reroute) {
 #ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6) {
-		if (ip6_route_me_harder(skb) != 0)
-			goto drop;
-	} else
+		if (af == AF_INET6) {
+			if (ip6_route_me_harder(skb) != 0)
+				goto drop;
+		} else
 #endif
-		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-			goto drop;
+			if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+				goto drop;
+	}
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 
@@ -991,8 +1002,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 
-			if (related)
+			if (related) {
+				if (sysctl_ip_vs_snat_reroute &&
+					NF_ACCEPT == verdict &&
+					ip6_route_me_harder(skb))
+					verdict = NF_DROP;
 				return verdict;
+			}
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 		}
 	} else
@@ -1000,8 +1016,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
 			int related, verdict = ip_vs_out_icmp(skb, &related);
 
-			if (related)
+			if (related) {
+				if (sysctl_ip_vs_snat_reroute &&
+					NF_ACCEPT == verdict &&
+					ip_route_me_harder(skb, RTN_LOCAL))
+					verdict = NF_DROP;
 				return verdict;
+			}
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 		}
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d2d842f..e637cd0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -91,6 +91,7 @@ int sysctl_ip_vs_nat_icmp_send = 0;
 #ifdef CONFIG_IP_VS_NFCT
 int sysctl_ip_vs_conntrack;
 #endif
+int sysctl_ip_vs_snat_reroute = 1;
 
 
 #ifdef CONFIG_IP_VS_DEBUG
@@ -1599,6 +1600,13 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
+	{
+		.procname	= "snat_reroute",
+		.data		= &sysctl_ip_vs_snat_reroute,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if 0
 	{
 		.procname	= "timeout_established",
-- 
cgit v1.1


From 99f07e91bef34db0fc8b1a224096e97f02dc0d56 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 21 Sep 2010 17:49:20 +0200
Subject: netfilter: save the hash of the tuple in the original direction for
 latter use

Since we don't change the tuple in the original direction, we can save it
in ct->tuplehash[IP_CT_DIR_REPLY].hnode.pprev for __nf_conntrack_confirm()
use.

__hash_conntrack() is split into two steps: hash_conntrack_raw() is used
to get the raw hash, and __hash_bucket() is used to get the bucket id.

In SYN-flood case, early_drop() doesn't need to recompute the hash again.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c | 112 ++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 34 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4c0ad9b..1eacf8d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -67,29 +67,40 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
-				  u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
 {
 	unsigned int n;
-	u_int32_t h;
 
 	/* The direction must be ignored, so we hash everything up to the
 	 * destination ports (which is a multiple of 4) and treat the last
 	 * three bytes manually.
 	 */
 	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
-	h = jhash2((u32 *)tuple, n,
-		   zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
-				 tuple->dst.protonum));
+	return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+		      (((__force __u16)tuple->dst.u.all << 16) |
+		      tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+	return ((u64)hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+	return __hash_bucket(hash, net->ct.htable_size);
+}
 
-	return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+				  u16 zone, unsigned int size)
+{
+	return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
 }
 
 static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
 				       const struct nf_conntrack_tuple *tuple)
 {
-	return __hash_conntrack(tuple, zone, net->ct.htable_size,
-				nf_conntrack_hash_rnd);
+	return __hash_conntrack(tuple, zone, net->ct.htable_size);
 }
 
 bool
@@ -291,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
  * OR
  * - Caller must lock nf_conntrack_lock before calling this function
  */
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
-		    const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
-	unsigned int hash = hash_conntrack(net, zone, tuple);
+	unsigned int bucket = hash_bucket(hash, net);
 
 	/* Disable BHs the entire time since we normally need to disable them
 	 * at least once for the stats anyway.
 	 */
 	local_bh_disable();
 begin:
-	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
 		if (nf_ct_tuple_equal(tuple, &h->tuple) &&
 		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
 			NF_CT_STAT_INC(net, found);
@@ -318,7 +329,7 @@ begin:
 	 * not the expected one, we must restart lookup.
 	 * We probably met an item that was moved to another chain.
 	 */
-	if (get_nulls_value(n) != hash) {
+	if (get_nulls_value(n) != bucket) {
 		NF_CT_STAT_INC(net, search_restart);
 		goto begin;
 	}
@@ -326,19 +337,27 @@ begin:
 
 	return NULL;
 }
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+		    const struct nf_conntrack_tuple *tuple)
+{
+	return ____nf_conntrack_find(net, zone, tuple,
+				     hash_conntrack_raw(tuple, zone));
+}
 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 
 /* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
-		      const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+			const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 
 	rcu_read_lock();
 begin:
-	h = __nf_conntrack_find(net, zone, tuple);
+	h = ____nf_conntrack_find(net, zone, tuple, hash);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (unlikely(nf_ct_is_dying(ct) ||
@@ -356,6 +375,14 @@ begin:
 
 	return h;
 }
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	return __nf_conntrack_find_get(net, zone, tuple,
+				       hash_conntrack_raw(tuple, zone));
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -408,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	/* reuse the hash saved before */
+	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+	hash = hash_bucket(hash, net);
+	repl_hash = hash_conntrack(net, zone,
+				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
 	/* We're not in hash table, and we refuse to set up related
 	   connections for unconfirmed conns.  But packet copies and
@@ -566,10 +596,11 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 	return dropped;
 }
 
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
-				   const struct nf_conntrack_tuple *orig,
-				   const struct nf_conntrack_tuple *repl,
-				   gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+		     const struct nf_conntrack_tuple *orig,
+		     const struct nf_conntrack_tuple *repl,
+		     gfp_t gfp, u32 hash)
 {
 	struct nf_conn *ct;
 
@@ -585,6 +616,9 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 			get_random_bytes(&rand, sizeof(rand));
 		} while (!rand);
 		cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+		/* recompute the hash as nf_conntrack_hash_rnd is initialized */
+		hash = hash_conntrack_raw(orig, zone);
 	}
 
 	/* We don't want any race condition at early drop stage */
@@ -592,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		unsigned int hash = hash_conntrack(net, zone, orig);
-		if (!early_drop(net, hash)) {
+		if (!early_drop(net, hash_bucket(hash, net))) {
 			atomic_dec(&net->ct.count);
 			if (net_ratelimit())
 				printk(KERN_WARNING
@@ -623,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
-	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+	/* save hash for reusing when confirming */
+	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
 	/* Don't set timer yet: wait for confirmation */
 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
 	write_pnet(&ct->ct_net, net);
@@ -650,6 +684,14 @@ out_free:
 	return ERR_PTR(-ENOMEM);
 #endif
 }
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+				   const struct nf_conntrack_tuple *orig,
+				   const struct nf_conntrack_tuple *repl,
+				   gfp_t gfp)
+{
+	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 
 void nf_conntrack_free(struct nf_conn *ct)
@@ -671,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	       struct nf_conntrack_l3proto *l3proto,
 	       struct nf_conntrack_l4proto *l4proto,
 	       struct sk_buff *skb,
-	       unsigned int dataoff)
+	       unsigned int dataoff, u32 hash)
 {
 	struct nf_conn *ct;
 	struct nf_conn_help *help;
@@ -685,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		return NULL;
 	}
 
-	ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+				  hash);
 	if (IS_ERR(ct)) {
 		pr_debug("Can't allocate conntrack.\n");
 		return (struct nf_conntrack_tuple_hash *)ct;
@@ -762,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+	u32 hash;
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
@@ -771,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* look for tuple match */
-	h = nf_conntrack_find_get(net, zone, &tuple);
+	hash = hash_conntrack_raw(&tuple, zone);
+	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
 		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
-				   skb, dataoff);
+				   skb, dataoff, hash);
 		if (!h)
 			return NULL;
 		if (IS_ERR(h))
@@ -1314,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			hlist_nulls_del_rcu(&h->hnnode);
 			bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
-						  hashsize,
-						  nf_conntrack_hash_rnd);
+						  hashsize);
 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
 		}
 	}
-- 
cgit v1.1


From 26c15cfd291f8b4ee40b4bbdf5e3772adfd704f5 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 21 Sep 2010 18:12:30 +0200
Subject: ipvs: changes related to service usecnt

	Change the usage of svc usecnt during command execution:

- we check if svc is registered but we do not need to hold usecnt
reference while under __ip_vs_mutex, only the packet handling needs
it during scheduling

- change __ip_vs_service_get to __ip_vs_service_find and
__ip_vs_svc_fwm_get to __ip_vs_svc_fwm_find because now caller
will increase svc->usecnt

- put common code that calls update_service in __ip_vs_update_dest

- put common code in ip_vs_unlink_service() and use it to unregister
the service

- add comment that svc should not be accessed after ip_vs_del_service
anymore

- all IP_VS_WAIT_WHILE calls are now unified: usecnt > 0

- Properly log the app ports

	As result, some problems are fixed:

- possible use-after-free of svc in ip_vs_genl_set_cmd after
ip_vs_del_service because our usecnt reference does not guarantee that
svc is not freed on refcnt==0, eg. when no dests are moved to trash

- possible usecnt leak in do_ip_vs_set_ctl after ip_vs_del_service
when the service is not freed now, for example, when some
destionations are moved into trash and svc->refcnt remains above 0.
It is harmless because svc is not in hash anymore.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/ip_vs_app.c |   6 +-
 net/netfilter/ipvs/ip_vs_ctl.c | 250 ++++++++++++++++-------------------------
 2 files changed, 102 insertions(+), 154 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index e76f87f..a475ede 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
 		goto out;
 
 	list_add(&inc->a_list, &app->incs_list);
-	IP_VS_DBG(9, "%s application %s:%u registered\n",
-		  pp->name, inc->name, inc->port);
+	IP_VS_DBG(9, "%s App %s:%u registered\n",
+		  pp->name, inc->name, ntohs(inc->port));
 
 	return 0;
 
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
 		pp->unregister_app(inc);
 
 	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-		  pp->name, inc->name, inc->port);
+		  pp->name, inc->name, ntohs(inc->port));
 
 	list_del(&inc->a_list);
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e637cd0..e4ec8f3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -405,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
  *	Get service by {proto,addr,port} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    __be16 vport)
 {
 	unsigned hash;
@@ -420,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    && (svc->port == vport)
 		    && (svc->protocol == protocol)) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -433,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
  *	Get service by {fwmark} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
@@ -444,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
 	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
 		if (svc->fwmark == fwmark && svc->af == af) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -463,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+	svc = __ip_vs_service_find(af, protocol, vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -480,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+		svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -488,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+		svc = __ip_vs_service_find(af, protocol, vaddr, 0);
 	}
 
   out:
+	if (svc)
+		atomic_inc(&svc->usecnt);
 	read_unlock(&__ip_vs_svc_lock);
 
 	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -510,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 	dest->svc = svc;
 }
 
-static inline void
+static void
 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 {
 	struct ip_vs_service *svc = dest->svc;
 
 	dest->svc = NULL;
-	if (atomic_dec_and_test(&svc->refcnt))
+	if (atomic_dec_and_test(&svc->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 }
 
 
@@ -762,8 +767,8 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
  *	Update a destination in the given service
  */
 static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+		    struct ip_vs_dest_user_kern *udest, int add)
 {
 	int conn_flags;
 
@@ -818,6 +823,25 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 	dest->u_threshold = udest->u_threshold;
 	dest->l_threshold = udest->l_threshold;
+
+	if (add)
+		ip_vs_new_estimator(&dest->stats);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	if (add) {
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+	}
+
+	/* call the update_service, because server weight may be changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 
@@ -865,13 +889,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	atomic_set(&dest->activeconns, 0);
 	atomic_set(&dest->inactconns, 0);
 	atomic_set(&dest->persistconns, 0);
-	atomic_set(&dest->refcnt, 0);
+	atomic_set(&dest->refcnt, 1);
 
 	INIT_LIST_HEAD(&dest->d_list);
 	spin_lock_init(&dest->dst_lock);
 	spin_lock_init(&dest->stats.lock);
-	__ip_vs_update_dest(svc, dest, udest);
-	ip_vs_new_estimator(&dest->stats);
+	__ip_vs_update_dest(svc, dest, udest, 1);
 
 	*dest_p = dest;
 
@@ -931,65 +954,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
 			      ntohs(dest->vport));
 
-		__ip_vs_update_dest(svc, dest, udest);
-
 		/*
 		 * Get the destination from the trash
 		 */
 		list_del(&dest->n_list);
 
-		ip_vs_new_estimator(&dest->stats);
-
-		write_lock_bh(&__ip_vs_svc_lock);
-
+		__ip_vs_update_dest(svc, dest, udest, 1);
+		ret = 0;
+	} else {
 		/*
-		 * Wait until all other svc users go away.
+		 * Allocate and initialize the dest structure
 		 */
-		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-		list_add(&dest->n_list, &svc->destinations);
-		svc->num_dests++;
-
-		/* call the update_service function of its scheduler */
-		if (svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
-
-		write_unlock_bh(&__ip_vs_svc_lock);
-		return 0;
-	}
-
-	/*
-	 * Allocate and initialize the dest structure
-	 */
-	ret = ip_vs_new_dest(svc, udest, &dest);
-	if (ret) {
-		return ret;
+		ret = ip_vs_new_dest(svc, udest, &dest);
 	}
-
-	/*
-	 * Add the dest entry into the list
-	 */
-	atomic_inc(&dest->refcnt);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	list_add(&dest->n_list, &svc->destinations);
-	svc->num_dests++;
-
-	/* call the update_service function of its scheduler */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
 	LeaveFunction(2);
 
-	return 0;
+	return ret;
 }
 
 
@@ -1028,19 +1008,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 		return -ENOENT;
 	}
 
-	__ip_vs_update_dest(svc, dest, udest);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/* Wait until all other svc users go away */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/* call the update_service, because server weight may be changed */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
+	__ip_vs_update_dest(svc, dest, udest, 0);
 	LeaveFunction(2);
 
 	return 0;
@@ -1067,6 +1035,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 	 *  the destination into the trash.
 	 */
 	if (atomic_dec_and_test(&dest->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port));
 		ip_vs_dst_reset(dest);
 		/* simply decrease svc->refcnt here, let the caller check
 		   and release the service if nobody refers to it.
@@ -1133,7 +1105,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 *	Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 *	Unlink dest from the service
@@ -1190,7 +1162,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	}
 
 	/* I'm the first user of the service */
-	atomic_set(&svc->usecnt, 1);
+	atomic_set(&svc->usecnt, 0);
 	atomic_set(&svc->refcnt, 0);
 
 	svc->af = u->af;
@@ -1284,7 +1256,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	/*
 	 * Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 * Set the flags and timeout value
@@ -1383,21 +1355,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	/*
 	 *    Free the service if nobody refers to it
 	 */
-	if (atomic_read(&svc->refcnt) == 0)
+	if (atomic_read(&svc->refcnt) == 0) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 }
 
 /*
- *	Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
  */
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
 {
-	if (svc == NULL)
-		return -EEXIST;
-
 	/*
 	 * Unhash it from the service table
 	 */
@@ -1408,11 +1382,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 	/*
 	 * Wait until all the svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	__ip_vs_del_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *	Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (svc == NULL)
+		return -EEXIST;
+	ip_vs_unlink_service(svc);
 
 	return 0;
 }
@@ -1431,14 +1415,7 @@ static int ip_vs_flush(void)
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1448,14 +1425,7 @@ static int ip_vs_flush(void)
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt,
 					 &ip_vs_svc_fwm_table[idx], f_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -2168,15 +2138,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
+		svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+		svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
 		ret = -ESRCH;
-		goto out_drop_service;
+		goto out_unlock;
 	}
 
 	switch (cmd) {
@@ -2210,10 +2180,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		ret = -EINVAL;
 	}
 
-out_drop_service:
-	if (svc)
-		ip_vs_service_put(svc);
-
   out_unlock:
 	mutex_unlock(&__ip_vs_mutex);
   out_dec:
@@ -2306,10 +2272,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 	int ret = 0;
 
 	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+		svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
 	else
-		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-					  get->port);
+		svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+					   get->port);
 
 	if (svc) {
 		int count = 0;
@@ -2337,7 +2303,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 			}
 			count++;
 		}
-		ip_vs_service_put(svc);
 	} else
 		ret = -ESRCH;
 	return ret;
@@ -2458,15 +2423,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		entry = (struct ip_vs_service_entry *)arg;
 		addr.ip = entry->addr;
 		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+			svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
 		else
-			svc = __ip_vs_service_get(AF_INET, entry->protocol,
-						  &addr, entry->port);
+			svc = __ip_vs_service_find(AF_INET, entry->protocol,
+						   &addr, entry->port);
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
 				ret = -EFAULT;
-			ip_vs_service_put(svc);
 		} else
 			ret = -ESRCH;
 	}
@@ -2733,10 +2697,12 @@ nla_put_failure:
 }
 
 static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-				    struct nlattr *nla, int full_entry)
+				    struct nlattr *nla, int full_entry,
+				    struct ip_vs_service **ret_svc)
 {
 	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
 	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+	struct ip_vs_service *svc;
 
 	/* Parse mandatory identifying service fields first */
 	if (nla == NULL ||
@@ -2772,12 +2738,18 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		usvc->fwmark = 0;
 	}
 
+	if (usvc->fwmark)
+		svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+	else
+		svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+					   &usvc->addr, usvc->port);
+	*ret_svc = svc;
+
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
 		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
 			      *nla_netmask;
 		struct ip_vs_flags flags;
-		struct ip_vs_service *svc;
 
 		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
 		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
@@ -2790,16 +2762,8 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		nla_memcpy(&flags, nla_flags, sizeof(flags));
 
 		/* prefill flags from service if it already exists */
-		if (usvc->fwmark)
-			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-		else
-			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-						  &usvc->addr, usvc->port);
-		if (svc) {
+		if (svc)
 			usvc->flags = svc->flags;
-			ip_vs_service_put(svc);
-		} else
-			usvc->flags = 0;
 
 		/* set new flags from userland */
 		usvc->flags = (usvc->flags & ~flags.mask) |
@@ -2815,17 +2779,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 {
 	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
 	int ret;
 
-	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (usvc.fwmark)
-		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-	else
-		return __ip_vs_service_get(usvc.af, usvc.protocol,
-					   &usvc.addr, usvc.port);
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+	return ret ? ERR_PTR(ret) : svc;
 }
 
 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2916,7 +2874,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 
 nla_put_failure:
 	cb->args[0] = idx;
-	ip_vs_service_put(svc);
 
 out_err:
 	mutex_unlock(&__ip_vs_mutex);
@@ -3129,17 +3086,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 
 	ret = ip_vs_genl_parse_service(&usvc,
 				       info->attrs[IPVS_CMD_ATTR_SERVICE],
-				       need_full_svc);
+				       need_full_svc, &svc);
 	if (ret)
 		goto out;
 
-	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
-	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
 	/* Unless we're adding a new service, the service must already exist */
 	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
 		ret = -ESRCH;
@@ -3173,6 +3123,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		break;
 	case IPVS_CMD_DEL_SERVICE:
 		ret = ip_vs_del_service(svc);
+		/* do not use svc, it can be freed */
 		break;
 	case IPVS_CMD_NEW_DEST:
 		ret = ip_vs_add_dest(svc, &udest);
@@ -3191,8 +3142,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	}
 
 out:
-	if (svc)
-		ip_vs_service_put(svc);
 	mutex_unlock(&__ip_vs_mutex);
 
 	return ret;
@@ -3238,7 +3187,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 			goto out_err;
 		} else if (svc) {
 			ret = ip_vs_genl_fill_service(msg, svc);
-			ip_vs_service_put(svc);
 			if (ret)
 				goto nla_put_failure;
 		} else {
-- 
cgit v1.1


From bcac0dfab191cb53b3f9b43c8014a34070ed58ff Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 22 Sep 2010 08:35:36 +0200
Subject: netfilter: ctnetlink: missing validation of CTA_EXPECT_ZONE attribute

This patch adds the missing validation of the CTA_EXPECT_ZONE
attribute in the ctnetlink code.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_netlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5bae1cd..37533a3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1733,6 +1733,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
 	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
 	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
 	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING },
+	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
 };
 
 static int
-- 
cgit v1.1


From 8b008faf92ac8f7eeb65e8cd36077601af7c46db Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 22 Sep 2010 08:36:59 +0200
Subject: netfilter: ctnetlink: allow to specify the expectation flags

With this patch, you can specify the expectation flags for user-space
created expectations.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_netlink.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 37533a3..0804e0e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1577,6 +1577,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 
 	NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
 	NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
+	NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
 	helper = rcu_dereference(nfct_help(master)->helper);
 	if (helper)
 		NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
@@ -1734,6 +1735,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
 	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
 	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING },
 	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
+	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
 };
 
 static int
@@ -1933,9 +1935,13 @@ ctnetlink_create_expect(struct net *net, u16 zone,
 		goto out;
 	}
 
+	if (cda[CTA_EXPECT_FLAGS])
+		exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+	else
+		exp->flags = 0;
+
 	exp->class = 0;
 	exp->expectfn = NULL;
-	exp->flags = 0;
 	exp->master = ct;
 	exp->helper = NULL;
 	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
-- 
cgit v1.1


From bc01befdcf3e40979eb518085a075cbf0aacede0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 28 Sep 2010 21:06:34 +0200
Subject: netfilter: ctnetlink: add support for user-space expectation helpers

This patch adds the basic infrastructure to support user-space
expectation helpers via ctnetlink and the netfilter queuing
infrastructure NFQUEUE. Basically, this patch:

* adds NF_CT_EXPECT_USERSPACE flag to identify user-space
  created expectations. I have also added a sanity check in
  __nf_ct_expect_check() to avoid that kernel-space helpers
  may create an expectation if the master conntrack has no
  helper assigned.
* adds some branches to check if the master conntrack helper
  exists, otherwise we skip the code that refers to kernel-space
  helper such as the local expectation list and the expectation
  policy.
* allows to set the timeout for user-space expectations with
  no helper assigned.
* a list of expectations created from user-space that depends
  on ctnetlink (if this module is removed, they are deleted).
* includes USERSPACE in the /proc output for expectations
  that have been created by a user-space helper.

This patch also modifies ctnetlink to skip including the helper
name in the Netlink messages if no kernel-space helper is set
(since no user-space expectation has not kernel-space kernel
assigned).

You can access an example user-space FTP conntrack helper at:
http://people.netfilter.org/pablo/userspace-conntrack-helpers/nf-ftp-helper-userspace-POC.tar.bz

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_expect.c  | 62 +++++++++++++++++++++++++++---------
 net/netfilter/nf_conntrack_netlink.c | 46 ++++++++++++++++----------
 2 files changed, 77 insertions(+), 31 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29cc..b30a1f2 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,20 +38,23 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
 
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
 /* nf_conntrack_expect helper functions */
 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct net *net = nf_ct_exp_net(exp);
 
-	NF_CT_ASSERT(master_help);
 	NF_CT_ASSERT(!timer_pending(&exp->timeout));
 
 	hlist_del_rcu(&exp->hnode);
 	net->ct.expect_count--;
 
 	hlist_del(&exp->lnode);
-	master_help->expecting[exp->class]--;
+	if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+		master_help->expecting[exp->class]--;
+
 	nf_ct_expect_put(exp);
 
 	NF_CT_STAT_INC(net, expect_delete);
@@ -320,16 +323,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 
 	atomic_inc(&exp->use);
 
-	hlist_add_head(&exp->lnode, &master_help->expectations);
-	master_help->expecting[exp->class]++;
+	if (master_help) {
+		hlist_add_head(&exp->lnode, &master_help->expectations);
+		master_help->expecting[exp->class]++;
+	} else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+		hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
 
 	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
 	net->ct.expect_count++;
 
 	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
 		    (unsigned long)exp);
-	p = &master_help->helper->expect_policy[exp->class];
-	exp->timeout.expires = jiffies + p->timeout * HZ;
+	if (master_help) {
+		p = &master_help->helper->expect_policy[exp->class];
+		exp->timeout.expires = jiffies + p->timeout * HZ;
+	}
 	add_timer(&exp->timeout);
 
 	atomic_inc(&exp->use);
@@ -380,7 +388,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 	unsigned int h;
 	int ret = 1;
 
-	if (!master_help->helper) {
+	/* Don't allow expectations created from kernel-space with no helper */
+	if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+	    (!master_help || (master_help && !master_help->helper))) {
 		ret = -ESHUTDOWN;
 		goto out;
 	}
@@ -398,13 +408,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 		}
 	}
 	/* Will be over limit? */
-	p = &master_help->helper->expect_policy[expect->class];
-	if (p->max_expected &&
-	    master_help->expecting[expect->class] >= p->max_expected) {
-		evict_oldest_expect(master, expect);
-		if (master_help->expecting[expect->class] >= p->max_expected) {
-			ret = -EMFILE;
-			goto out;
+	if (master_help) {
+		p = &master_help->helper->expect_policy[expect->class];
+		if (p->max_expected &&
+		    master_help->expecting[expect->class] >= p->max_expected) {
+			evict_oldest_expect(master, expect);
+			if (master_help->expecting[expect->class]
+						>= p->max_expected) {
+				ret = -EMFILE;
+				goto out;
+			}
 		}
 	}
 
@@ -439,6 +452,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
 
+void nf_ct_remove_userspace_expectations(void)
+{
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *n, *next;
+
+	hlist_for_each_entry_safe(exp, n, next,
+				  &nf_ct_userspace_expect_list, lnode) {
+		if (del_timer(&exp->timeout)) {
+			nf_ct_unlink_expect(exp);
+			nf_ct_expect_put(exp);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
 #ifdef CONFIG_PROC_FS
 struct ct_expect_iter_state {
 	struct seq_net_private p;
@@ -529,8 +557,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
 		seq_printf(s, "PERMANENT");
 		delim = ",";
 	}
-	if (expect->flags & NF_CT_EXPECT_INACTIVE)
+	if (expect->flags & NF_CT_EXPECT_INACTIVE) {
 		seq_printf(s, "%sINACTIVE", delim);
+		delim = ",";
+	}
+	if (expect->flags & NF_CT_EXPECT_USERSPACE)
+		seq_printf(s, "%sUSERSPACE", delim);
 
 	helper = rcu_dereference(nfct_help(expect->master)->helper);
 	if (helper) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 0804e0e..b4077be 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1560,8 +1560,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 			  const struct nf_conntrack_expect *exp)
 {
 	struct nf_conn *master = exp->master;
-	struct nf_conntrack_helper *helper;
 	long timeout = (exp->timeout.expires - jiffies) / HZ;
+	struct nf_conn_help *help;
 
 	if (timeout < 0)
 		timeout = 0;
@@ -1578,9 +1578,14 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 	NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
 	NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
 	NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
-	helper = rcu_dereference(nfct_help(master)->helper);
-	if (helper)
-		NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	help = nfct_help(master);
+	if (help) {
+		struct nf_conntrack_helper *helper;
+
+		helper = rcu_dereference(help->helper);
+		if (helper)
+			NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	}
 
 	return 0;
 
@@ -1921,24 +1926,32 @@ ctnetlink_create_expect(struct net *net, u16 zone,
 	if (!h)
 		return -ENOENT;
 	ct = nf_ct_tuplehash_to_ctrack(h);
-	help = nfct_help(ct);
-
-	if (!help || !help->helper) {
-		/* such conntrack hasn't got any helper, abort */
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
 	exp = nf_ct_expect_alloc(ct);
 	if (!exp) {
 		err = -ENOMEM;
 		goto out;
 	}
+	help = nfct_help(ct);
+	if (!help) {
+		if (!cda[CTA_EXPECT_TIMEOUT]) {
+			err = -EINVAL;
+			goto out;
+		}
+		exp->timeout.expires =
+		  jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
 
-	if (cda[CTA_EXPECT_FLAGS])
-		exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
-	else
-		exp->flags = 0;
+		exp->flags = NF_CT_EXPECT_USERSPACE;
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags |=
+				ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+		}
+	} else {
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+			exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+		} else
+			exp->flags = 0;
+	}
 
 	exp->class = 0;
 	exp->expectfn = NULL;
@@ -2109,6 +2122,7 @@ static void __exit ctnetlink_exit(void)
 {
 	pr_info("ctnetlink: unregistering from nfnetlink.\n");
 
+	nf_ct_remove_userspace_expectations();
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
 	nf_conntrack_unregister_notifier(&ctnl_notifier);
-- 
cgit v1.1


From 5adbb9fb0c35c38022f79e09fecf15ba8f65f069 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:51 +0900
Subject: netfilter: nf_conntrack_sip: Allow ct_sip_get_header() to be called
 with a null ct argument

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/nf_conntrack_sip.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d8922..2fd1ea2 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -152,6 +152,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
 	const char *end;
 	int ret = 0;
 
+	if (!ct)
+		return 0;
+
 	memset(addr, 0, sizeof(*addr));
 	switch (nf_ct_l3num(ct)) {
 	case AF_INET:
-- 
cgit v1.1


From 001985b2c0cfad48e1dec8e30f4d432eac240dd2 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:51 +0900
Subject: netfilter: nf_conntrack_sip: Add callid parser

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/nf_conntrack_sip.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 2fd1ea2..715ce54 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
 	return len;
 }
 
+static int iswordc(const char c)
+{
+	if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+	    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+	    c == '{' || c == '}' || c == '~')
+		return 1;
+	return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+	int len = 0;
+	while (dptr < limit && iswordc(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	int len, domain_len;
+
+	len = word_len(dptr, limit);
+	dptr += len;
+	if (!len || dptr == limit || *dptr != '@')
+		return len;
+	dptr++;
+	len++;
+
+	domain_len = word_len(dptr, limit);
+	if (!domain_len)
+		return 0;
+	return len + domain_len;
+}
+
 /* get media type + port length */
 static int media_len(const struct nf_conn *ct, const char *dptr,
 		     const char *limit, int *shift)
@@ -299,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
 	[SIP_HDR_VIA_TCP]		= SIP_HDR("Via", "v", "TCP ", epaddr_len),
 	[SIP_HDR_EXPIRES]		= SIP_HDR("Expires", NULL, NULL, digits_len),
 	[SIP_HDR_CONTENT_LENGTH]	= SIP_HDR("Content-Length", "l", NULL, digits_len),
+	[SIP_HDR_CALL_ID]		= SIP_HDR("Call-Id", "i", NULL, callid_len),
 };
 
 static const char *sip_follow_continuation(const char *dptr, const char *limit)
-- 
cgit v1.1


From 5b57a98c1f0d78a4c238d83c4ac70de3bd237b2f Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:51 +0900
Subject: IPVS: compact ip_vs_sched_persist()

Compact ip_vs_sched_persist() by setting up parameters
and calling functions once.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_core.c | 156 +++++++++++++---------------------------
 1 file changed, 51 insertions(+), 105 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 06c388b..70a5cac 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -193,10 +193,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	__be16  dport;			/* destination port to forward */
+	int protocol = iph.protocol;
+	__be16 dport = 0;		/* destination port to forward */
+	__be16 vport = 0;		/* virtual service port */
 	unsigned int flags;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
+	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+	const union nf_inet_addr *vaddr = &iph.daddr;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
@@ -227,119 +231,61 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 	 * is created for other persistent services.
 	 */
-	if (ports[1] == svc->port) {
-		/* Check if a template already exists */
-		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, ports[1]);
-		else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * No template found or the dest of the connection
-			 * template is not available.
-			 */
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
-
-			/*
-			 * Create a template like <protocol,caddr,0,
-			 * vaddr,vport,daddr,dport> for non-ftp service,
-			 * and <protocol,caddr,0,vaddr,0,daddr,0>
-			 * for ftp service.
+	{
+		if (ports[1] == svc->port) {
+			/* non-FTP template:
+			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+			 * FTP template:
+			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 			 */
 			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr,
-						    ports[1],
-						    &dest->addr, dest->port,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
+				vport = ports[1];
 		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
+			/* Note: persistent fwmark-based services and
+			 * persistent port zero service are handled here.
+			 * fwmark template:
+			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+			 * port zero template:
+			 * <protocol,caddr,0,vaddr,0,daddr,0>
+			 */
+			if (svc->fwmark) {
+				protocol = IPPROTO_IP;
+				vaddr = &fwmark;
+			}
 		}
-		dport = dest->port;
-	} else {
-		/*
-		 * Note: persistent fwmark-based services and persistent
-		 * port zero service are handled here.
-		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
+	}
+
+	/* Check if a template already exists */
+	ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport);
+
+	if (!ct || !ip_vs_check_template(ct)) {
+		/* No template found or the dest of the connection
+		 * template is not available.
 		 */
-		if (svc->fwmark) {
-			union nf_inet_addr fwmark = {
-				.ip = htonl(svc->fwmark)
-			};
+		dest = svc->scheduler->schedule(svc, skb);
+		if (!dest) {
+			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			return NULL;
+		}
 
-			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-					     &fwmark, 0);
-		} else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
+		if (ports[1] == svc->port && svc->port != FTPPORT)
+			dport = dest->port;
 
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * If it is not persistent port zero, return NULL,
-			 * otherwise create a connection template.
-			 */
-			if (svc->port)
-				return NULL;
+		/* Create a template */
+		ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport,
+				    &dest->addr, dport,
+				    IP_VS_CONN_F_TEMPLATE, dest);
+		if (ct == NULL)
+			return NULL;
 
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
+		ct->timeout = svc->timeout;
+	} else
+		/* set destination with the found template */
+		dest = ct->dest;
 
-			/*
-			 * Create a template according to the service
-			 */
-			if (svc->fwmark) {
-				union nf_inet_addr fwmark = {
-					.ip = htonl(svc->fwmark)
-				};
-
-				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-						    &snet, 0,
-						    &fwmark, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			} else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
-		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
-		}
-		dport = ports[1];
-	}
+	dport = ports[1];
+	if (dport == svc->port && dest->port)
+		dport = dest->port;
 
 	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 		 && iph.protocol == IPPROTO_UDP)?
-- 
cgit v1.1


From f11017ec2d1859c661f4e2b12c4a8d250e1f47cf Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:52 +0900
Subject: IPVS: Add struct ip_vs_conn_param

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_conn.c         | 171 ++++++++++++++++----------------
 net/netfilter/ipvs/ip_vs_core.c         |  64 ++++++------
 net/netfilter/ipvs/ip_vs_ftp.c          |  43 ++++----
 net/netfilter/ipvs/ip_vs_nfct.c         |  12 +--
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |  47 ++++-----
 net/netfilter/ipvs/ip_vs_sync.c         |  33 +++---
 6 files changed, 181 insertions(+), 189 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index a970d96..deeb906 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -218,27 +218,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
  *  Called for pkts coming from OUTside-to-INside.
- *	s_addr, s_port: pkt source address (foreign host)
- *	d_addr, d_port: pkt dest address (load balancer)
+ *	p->caddr, p->cport: pkt source address (foreign host)
+ *	p->vaddr, p->vport: pkt dest address (load balancer)
  */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
-		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
+		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ct_read_unlock(hash);
@@ -251,71 +250,82 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
 	return NULL;
 }
 
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	struct ip_vs_conn *cp;
 
-	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-					 d_port);
+	cp = __ip_vs_conn_in_get(p);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+		struct ip_vs_conn_param cport_zero_p = *p;
+		cport_zero_p.cport = 0;
+		cp = __ip_vs_conn_in_get(&cport_zero_p);
+	}
 
 	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+			    const struct ip_vs_iphdr *iph,
+			    unsigned int proto_off, int inverse,
+			    struct ip_vs_conn_param *p)
+{
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return 1;
+
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+				      &iph->daddr, pptr[1], p);
+	else
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+				      &iph->saddr, pptr[0], p);
+	return 0;
+}
+
 struct ip_vs_conn *
 ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
 			struct ip_vs_protocol *pp,
 			const struct ip_vs_iphdr *iph,
 			unsigned int proto_off, int inverse)
 {
-	__be16 _ports[2], *pptr;
+	struct ip_vs_conn_param p;
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
 		return NULL;
 
-	if (likely(!inverse))
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->saddr, pptr[0],
-					 &iph->daddr, pptr[1]);
-	else
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->daddr, pptr[1],
-					 &iph->saddr, pptr[0]);
+	return ip_vs_conn_in_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
 
 /* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    /* protocol should only be IPPROTO_IP if
-		     * d_addr is a fwmark */
-		    ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
-		                     d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
+		     * p->vaddr is a fwmark */
+		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+				     p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    protocol == cp->protocol) {
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			goto out;
@@ -327,23 +337,19 @@ struct ip_vs_conn *ip_vs_ct_in_get
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from inside-to-OUTside.
- *	s_addr, s_port: pkt source address (inside host)
- *	d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ *	p->caddr, p->cport: pkt source address (inside host)
+ *	p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp, *ret=NULL;
@@ -351,16 +357,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-		    d_port == cp->cport && s_port == cp->dport &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+		    p->vport == cp->cport && p->cport == cp->dport &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ret = cp;
@@ -371,9 +377,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      ret ? "hit" : "not hit");
 
 	return ret;
@@ -385,20 +391,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
 			 const struct ip_vs_iphdr *iph,
 			 unsigned int proto_off, int inverse)
 {
-	__be16 _ports[2], *pptr;
+	struct ip_vs_conn_param p;
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
 		return NULL;
 
-	if (likely(!inverse))
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->saddr, pptr[0],
-					  &iph->daddr, pptr[1]);
-	else
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->daddr, pptr[1],
-					  &iph->saddr, pptr[0]);
+	return ip_vs_conn_out_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 
@@ -758,13 +756,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
  *	Create a new connection entry and hash it into the ip_vs_conn_tab
  */
 struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-	       const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
 	       struct ip_vs_dest *dest)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
 
 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
@@ -774,14 +771,14 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-	cp->af		   = af;
-	cp->protocol	   = proto;
-	ip_vs_addr_copy(af, &cp->caddr, caddr);
-	cp->cport	   = cport;
-	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-	cp->vport	   = vport;
+	cp->af		   = p->af;
+	cp->protocol	   = p->protocol;
+	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+	cp->cport	   = p->cport;
+	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+	cp->vport	   = p->vport;
 	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
-	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
@@ -810,7 +807,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
 	/* Bind its packet transmitter */
 #ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
+	if (p->af == AF_INET6)
 		ip_vs_bind_xmit_v6(cp);
 	else
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 70a5cac..87602a6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -193,14 +193,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	int protocol = iph.protocol;
 	__be16 dport = 0;		/* destination port to forward */
-	__be16 vport = 0;		/* virtual service port */
 	unsigned int flags;
+	struct ip_vs_conn_param param;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
-	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
-	const union nf_inet_addr *vaddr = &iph.daddr;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
@@ -232,6 +229,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	 * is created for other persistent services.
 	 */
 	{
+		int protocol = iph.protocol;
+		const union nf_inet_addr *vaddr = &iph.daddr;
+		const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+		__be16 vport = 0;
+
 		if (ports[1] == svc->port) {
 			/* non-FTP template:
 			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
@@ -253,11 +255,12 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				vaddr = &fwmark;
 			}
 		}
+		ip_vs_conn_fill_param(svc->af, protocol, &snet, 0,
+				      vaddr, vport, &param);
 	}
 
 	/* Check if a template already exists */
-	ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport);
-
+	ct = ip_vs_ct_in_get(&param);
 	if (!ct || !ip_vs_check_template(ct)) {
 		/* No template found or the dest of the connection
 		 * template is not available.
@@ -272,8 +275,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 			dport = dest->port;
 
 		/* Create a template */
-		ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport,
-				    &dest->addr, dport,
+		ct = ip_vs_conn_new(&param, &dest->addr, dport,
 				    IP_VS_CONN_F_TEMPLATE, dest);
 		if (ct == NULL)
 			return NULL;
@@ -294,12 +296,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/*
 	 *    Create a new connection according to the template
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, ports[0],
-			    &iph.daddr, ports[1],
-			    &dest->addr, dport,
-			    flags,
-			    dest);
+	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+			      &iph.daddr, ports[1], &param);
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
 		return NULL;
@@ -366,14 +365,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	/*
 	 *    Create a connection entry.
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, pptr[0],
-			    &iph.daddr, pptr[1],
-			    &dest->addr, dest->port ? dest->port : pptr[1],
-			    flags,
-			    dest);
-	if (cp == NULL)
-		return NULL;
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+				      pptr[0], &iph.daddr, pptr[1], &p);
+		cp = ip_vs_conn_new(&p, &dest->addr,
+				    dest->port ? dest->port : pptr[1],
+				    flags, dest);
+		if (!cp)
+			return NULL;
+	}
 
 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -429,14 +430,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 
 		/* create a new connection entry */
 		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
-		cp = ip_vs_conn_new(svc->af, iph.protocol,
-				    &iph.saddr, pptr[0],
-				    &iph.daddr, pptr[1],
-				    &daddr, 0,
-				    IP_VS_CONN_F_BYPASS | flags,
-				    NULL);
-		if (cp == NULL)
-			return NF_DROP;
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc->af, iph.protocol,
+					      &iph.saddr, pptr[0],
+					      &iph.daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL);
+			if (!cp)
+				return NF_DROP;
+		}
 
 		/* statistics */
 		ip_vs_in_stats(cp, skb);
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 9cd375f..090889a 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -195,13 +195,17 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		/*
 		 * Now update or create an connection entry for it
 		 */
-		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-					  &cp->caddr, 0);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, iph->protocol,
+					      &from, port, &cp->caddr, 0, &p);
+			n_cp = ip_vs_conn_out_get(&p);
+		}
 		if (!n_cp) {
-			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-					      &cp->caddr, 0,
-					      &cp->vaddr, port,
-					      &from, port,
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+					      0, &cp->vaddr, port, &p);
+			n_cp = ip_vs_conn_new(&p, &from, port,
 					      IP_VS_CONN_F_NO_CPORT |
 					      IP_VS_CONN_F_NFCT,
 					      cp->dest);
@@ -347,21 +351,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		  ip_vs_proto_name(iph->protocol),
 		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
 
-	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-				 &to, port,
-				 &cp->vaddr, htons(ntohs(cp->vport)-1));
-	if (!n_cp) {
-		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-				      &to, port,
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
 				      &cp->vaddr, htons(ntohs(cp->vport)-1),
-				      &cp->daddr, htons(ntohs(cp->dport)-1),
-				      IP_VS_CONN_F_NFCT,
-				      cp->dest);
-		if (!n_cp)
-			return 0;
+				      &p);
+		n_cp = ip_vs_conn_in_get(&p);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(&p, &cp->daddr,
+					      htons(ntohs(cp->dport)-1),
+					      IP_VS_CONN_F_NFCT, cp->dest);
+			if (!n_cp)
+				return 0;
 
-		/* add its controller */
-		ip_vs_control_add(n_cp, cp);
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
 	}
 
 	/*
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index c038458..4680647 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -140,6 +140,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 {
 	struct nf_conntrack_tuple *orig, new_reply;
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
 	if (exp->tuple.src.l3num != PF_INET)
 		return;
@@ -154,9 +155,10 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 
 	/* RS->CLIENT */
 	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
-				&orig->src.u3, orig->src.u.tcp.port,
-				&orig->dst.u3, orig->dst.u.tcp.port);
+	ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+			      &orig->src.u3, orig->src.u.tcp.port,
+			      &orig->dst.u3, orig->dst.u.tcp.port, &p);
+	cp = ip_vs_conn_out_get(&p);
 	if (cp) {
 		/* Change reply CLIENT->RS to CLIENT->VS */
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
@@ -176,9 +178,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	}
 
 	/* CLIENT->VS */
-	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
-			       &orig->src.u3, orig->src.u.tcp.port,
-			       &orig->dst.u3, orig->dst.u.tcp.port);
+	cp = ip_vs_conn_in_get(&p);
 	if (cp) {
 		/* Change reply VS->CLIENT to RS->CLIENT */
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc..8956ef3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -40,6 +40,19 @@ struct isakmp_hdr {
 
 #define PORT_ISAKMP	500
 
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+			     int inverse, struct ip_vs_conn_param *p)
+{
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->saddr, htons(PORT_ISAKMP),
+				      &iph->daddr, htons(PORT_ISAKMP), p);
+	else
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->daddr, htons(PORT_ISAKMP),
+				      &iph->saddr, htons(PORT_ISAKMP), p);
+}
 
 static struct ip_vs_conn *
 ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		   int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->saddr,
-				       htons(PORT_ISAKMP),
-				       &iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->daddr,
-				       htons(PORT_ISAKMP),
-				       &iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_in_get(&p);
 	if (!cp) {
 		/*
 		 * We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 		    int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->saddr,
-					htons(PORT_ISAKMP),
-					&iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->daddr,
-					htons(PORT_ISAKMP),
-					&iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_out_get(&p);
 	if (!cp) {
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
 			      "%s%s %s->%s\n",
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba0693..f68631f 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -301,6 +301,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_dest *dest;
+	struct ip_vs_conn_param param;
 	char *p;
 	int i;
 
@@ -370,18 +371,17 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 			}
 		}
 
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport);
-		else
-			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-					     (union nf_inet_addr *)&s->caddr,
-					     s->cport,
-					     (union nf_inet_addr *)&s->vaddr,
-					     s->vport);
+		{
+			ip_vs_conn_fill_param(AF_INET, s->protocol,
+					      (union nf_inet_addr *)&s->caddr,
+					      s->cport,
+					      (union nf_inet_addr *)&s->vaddr,
+					      s->vport, &param);
+			if (!(flags & IP_VS_CONN_F_TEMPLATE))
+				cp = ip_vs_conn_in_get(&param);
+			else
+				cp = ip_vs_ct_in_get(&param);
+		}
 		if (!cp) {
 			/*
 			 * Find the appropriate destination for the connection.
@@ -406,14 +406,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 				else
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
-			cp = ip_vs_conn_new(AF_INET, s->protocol,
-					    (union nf_inet_addr *)&s->caddr,
-					    s->cport,
-					    (union nf_inet_addr *)&s->vaddr,
-					    s->vport,
+			cp = ip_vs_conn_new(&param,
 					    (union nf_inet_addr *)&s->daddr,
-					    s->dport,
-					    flags, dest);
+					    s->dport, flags, dest);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 			if (!cp) {
-- 
cgit v1.1


From 6e08bfb879574524cc9a67be960c684989fd986c Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:52 +0900
Subject: IPVS: Allow null argument to ip_vs_scheduler_put()

This simplifies caller logic sightly.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_ctl.c   | 13 ++++---------
 net/netfilter/ipvs/ip_vs_sched.c |  2 +-
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e4ec8f3..f7afcfe 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1144,7 +1144,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	if (sched == NULL) {
 		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
 		ret = -ENOENT;
-		goto out_mod_dec;
+		goto out_err;
 	}
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1204,7 +1204,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	*svc_p = svc;
 	return 0;
 
-  out_err:
+ out_err:
 	if (svc != NULL) {
 		if (svc->scheduler)
 			ip_vs_unbind_scheduler(svc);
@@ -1217,7 +1217,6 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	}
 	ip_vs_scheduler_put(sched);
 
-  out_mod_dec:
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 
@@ -1300,10 +1299,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 #ifdef CONFIG_IP_VS_IPV6
   out:
 #endif
-
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
-
+	ip_vs_scheduler_put(old_sched);
 	return ret;
 }
 
@@ -1327,8 +1323,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	/* Unbind scheduler */
 	old_sched = svc->scheduler;
 	ip_vs_unbind_scheduler(svc);
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
+	ip_vs_scheduler_put(old_sched);
 
 	/* Unbind app inc */
 	if (svc->inc) {
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 727e45b..9f94e32 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -159,7 +159,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
 
 void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 {
-	if (scheduler->module)
+	if (scheduler && scheduler->module)
 		module_put(scheduler->module);
 }
 
-- 
cgit v1.1


From 2fabf35bfcd89445c54cf1e6a5437dd3cf924a92 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:52 +0900
Subject: IPVS: ip_vs_{un,}bind_scheduler NULL arguments

In general NULL arguments aren't passed by the few callers that exist,
so don't test for them.

The exception is to make passing NULL to ip_vs_unbind_scheduler() a noop.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_ctl.c   |  3 +--
 net/netfilter/ipvs/ip_vs_sched.c | 23 +++--------------------
 2 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index f7afcfe..5f20caf 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1206,8 +1206,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 
  out_err:
 	if (svc != NULL) {
-		if (svc->scheduler)
-			ip_vs_unbind_scheduler(svc);
+		ip_vs_unbind_scheduler(svc);
 		if (svc->inc) {
 			local_bh_disable();
 			ip_vs_app_inc_put(svc->inc);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 9f94e32..076ebe0 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 {
 	int ret;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-	if (scheduler == NULL) {
-		pr_err("%s(): scheduler arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
 	svc->scheduler = scheduler;
 
 	if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
  */
 int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
-	struct ip_vs_scheduler *sched;
+	struct ip_vs_scheduler *sched = svc->scheduler;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
-	sched = svc->scheduler;
-	if (sched == NULL) {
-		pr_err("%s(): svc isn't bound\n", __func__);
-		return -EINVAL;
-	}
+	if (!sched)
+		return 0;
 
 	if (sched->done_service) {
 		if (sched->done_service(svc) != 0) {
-- 
cgit v1.1


From 85999283a21ab2dd37427fdd8c8e8af57223977c Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:53 +0900
Subject: IPVS: Add struct ip_vs_pe

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_conn.c | 67 ++++++++++++++++++++++++++++++++++-------
 net/netfilter/ipvs/ip_vs_core.c | 36 +++++++++++++++++-----
 net/netfilter/ipvs/ip_vs_sync.c | 17 +++++++++--
 3 files changed, 100 insertions(+), 20 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index deeb906..06da21e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
 		& ip_vs_conn_tab_mask;
 }
 
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+					     bool inverse)
+{
+	const union nf_inet_addr *addr;
+	__be16 port;
+
+	if (p->pe && p->pe->hashkey_raw)
+		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+			ip_vs_conn_tab_mask;
+
+	if (likely(!inverse)) {
+		addr = p->caddr;
+		port = p->cport;
+	} else {
+		addr = p->vaddr;
+		port = p->vport;
+	}
+
+	return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn_param p;
+
+	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+			      NULL, 0, &p);
+
+	if (cp->dest && cp->dest->svc->pe) {
+		p.pe = cp->dest->svc->pe;
+		p.pe_data = cp->pe_data;
+		p.pe_data_len = cp->pe_data_len;
+	}
+
+	return ip_vs_conn_hashkey_param(&p, false);
+}
 
 /*
  *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 		return 0;
 
 	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -227,7 +263,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
@@ -312,11 +348,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (p->pe && p->pe->ct_match) {
+			if (p->pe->ct_match(p, cp))
+				goto out;
+			continue;
+		}
+
 		if (cp->af == p->af &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    /* protocol should only be IPPROTO_IP if
@@ -325,15 +367,14 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 				     p->af, p->vaddr, &cp->vaddr) &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    p->protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
+		    p->protocol == cp->protocol)
 			goto out;
-		}
 	}
 	cp = NULL;
 
   out:
+	if (cp)
+		atomic_inc(&cp->refcnt);
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
@@ -357,7 +398,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport);
+	hash = ip_vs_conn_hashkey_param(p, true);
 
 	ct_read_lock(hash);
 
@@ -722,6 +763,7 @@ static void ip_vs_conn_expire(unsigned long data)
 		if (cp->flags & IP_VS_CONN_F_NFCT)
 			ip_vs_conn_drop_conntrack(cp);
 
+		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
@@ -782,6 +824,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
+	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+		cp->pe_data = p->pe_data;
+		cp->pe_data_len = p->pe_data_len;
+	}
 	spin_lock_init(&cp->lock);
 
 	/*
@@ -832,7 +878,6 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	return cp;
 }
 
-
 /*
  *	/proc/net/ip_vs_conn entries
  */
@@ -848,7 +893,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (pos-- == 0) {
 				seq->private = &ip_vs_conn_tab[idx];
-				return cp;
+			return cp;
 			}
 		}
 		ct_read_unlock_bh(idx);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 87602a6..ab98893 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -176,6 +176,19 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+	p->pe = svc->pe;
+	if (p->pe && p->pe->fill_param)
+		return p->pe->fill_param(p, skb);
+	return 0;
+}
 
 /*
  *  IPVS persistent scheduling function
@@ -186,7 +199,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
  */
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
-		    const struct sk_buff *skb,
+		    struct sk_buff *skb,
 		    __be16 ports[2])
 {
 	struct ip_vs_conn *cp = NULL;
@@ -255,8 +268,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				vaddr = &fwmark;
 			}
 		}
-		ip_vs_conn_fill_param(svc->af, protocol, &snet, 0,
-				      vaddr, vport, &param);
+		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+						  vaddr, vport, &param))
+			return NULL;
 	}
 
 	/* Check if a template already exists */
@@ -268,22 +282,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		dest = svc->scheduler->schedule(svc, skb);
 		if (!dest) {
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			kfree(param.pe_data);
 			return NULL;
 		}
 
 		if (ports[1] == svc->port && svc->port != FTPPORT)
 			dport = dest->port;
 
-		/* Create a template */
+		/* Create a template
+		 * This adds param.pe_data to the template,
+		 * and thus param.pe_data will be destroyed
+		 * when the template expires */
 		ct = ip_vs_conn_new(&param, &dest->addr, dport,
 				    IP_VS_CONN_F_TEMPLATE, dest);
-		if (ct == NULL)
+		if (ct == NULL) {
+			kfree(param.pe_data);
 			return NULL;
+		}
 
 		ct->timeout = svc->timeout;
-	} else
+	} else {
 		/* set destination with the found template */
 		dest = ct->dest;
+		kfree(param.pe_data);
+	}
 
 	dport = ports[1];
 	if (dport == svc->port && dest->port)
@@ -322,7 +344,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index f68631f..ab85aed 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 		ip_vs_sync_conn(cp->control);
 }
 
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+			   const union nf_inet_addr *caddr, __be16 cport,
+			   const union nf_inet_addr *vaddr, __be16 vport,
+			   struct ip_vs_conn_param *p)
+{
+	/* XXX: Need to take into account persistence engine */
+	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+	return 0;
+}
 
 /*
  *      Process received multicast message and create the corresponding
@@ -372,11 +382,14 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 		}
 
 		{
-			ip_vs_conn_fill_param(AF_INET, s->protocol,
+			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
 					      (union nf_inet_addr *)&s->caddr,
 					      s->cport,
 					      (union nf_inet_addr *)&s->vaddr,
-					      s->vport, &param);
+					      s->vport, &param)) {
+				pr_err("ip_vs_conn_fill_param_sync failed");
+				return;
+			}
 			if (!(flags & IP_VS_CONN_F_TEMPLATE))
 				cp = ip_vs_conn_in_get(&param);
 			else
-- 
cgit v1.1


From a3c918acd29a96aba3b46bf50136e7953a480d17 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:53 +0900
Subject: IPVS: Add persistence engine data to /proc/net/ip_vs_conn

This shouldn't break compatibility with userspace as the new data
is at the end of the line.

I have confirmed that this doesn't break ipvsadm, the main (only?)
user-space user of this data.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_conn.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 06da21e..4adedef 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -950,30 +950,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 
 	if (v == SEQ_START_TOKEN)
 		seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
 	else {
 		const struct ip_vs_conn *cp = v;
+		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+		size_t len = 0;
+
+		if (cp->dest && cp->dest->svc->pe &&
+		    cp->dest->svc->pe->show_pe_data) {
+			pe_data[0] = ' ';
+			len = strlen(cp->dest->svc->pe->name);
+			memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+			pe_data[len + 1] = ' ';
+			len += 2;
+			len += cp->dest->svc->pe->show_pe_data(cp,
+							       pe_data + len);
+		}
+		pe_data[len] = '\0';
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
-			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+				"%pI6 %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
 				&cp->daddr.in6, ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 		else
 #endif
 			seq_printf(seq,
 				"%-3s %08X %04X %08X %04X"
-				" %08X %04X %-11s %7lu\n",
+				" %08X %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
 				ntohl(cp->daddr.ip), ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 	}
 	return 0;
 }
-- 
cgit v1.1


From 8be67a6617b3403551fccb67b1c624c659419515 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:54 +0900
Subject: IPVS: management of persistence engine modules

This is based heavily on the scheduler management code

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/Makefile   |   2 +-
 net/netfilter/ipvs/ip_vs_pe.c | 147 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 net/netfilter/ipvs/ip_vs_pe.c

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index 349fe88..4a87bf3 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -14,7 +14,7 @@ ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
 
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
-		ip_vs_est.o ip_vs_proto.o 				   \
+		ip_vs_est.o ip_vs_proto.o ip_vs_pe.o			   \
 		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
 
 
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 0000000..3414af7
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock for service table */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+	svc->pe = pe;
+}
+
+/* Unbind a service from its pe */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+	svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+	struct ip_vs_pe *pe;
+
+	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+		  pe_name);
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	list_for_each_entry(pe, &ip_vs_pe, n_list) {
+		/* Test and get the modules atomically */
+		if (pe->module &&
+		    !try_module_get(pe->module)) {
+			/* This pe is just deleted */
+			continue;
+		}
+		if (strcmp(pe_name, pe->name)==0) {
+			/* HIT */
+			spin_unlock_bh(&ip_vs_pe_lock);
+			return pe;
+		}
+		if (pe->module)
+			module_put(pe->module);
+	}
+
+	spin_unlock_bh(&ip_vs_pe_lock);
+	return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+	struct ip_vs_pe *pe;
+
+	/* Search for the pe by name */
+	pe = ip_vs_pe_getbyname(name);
+
+	/* If pe not found, load the module and search again */
+	if (!pe) {
+		request_module("ip_vs_pe_%s", name);
+		pe = ip_vs_pe_getbyname(name);
+	}
+
+	return pe;
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+	if (pe && pe->module)
+		module_put(pe->module);
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	struct ip_vs_pe *tmp;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	if (!list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		ip_vs_use_count_dec();
+		pr_err("%s(): [%s] pe already linked\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Make sure that the pe with this name doesn't exist
+	 * in the pe list.
+	 */
+	list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+		if (strcmp(tmp->name, pe->name) == 0) {
+			spin_unlock_bh(&ip_vs_pe_lock);
+			ip_vs_use_count_dec();
+			pr_err("%s(): [%s] pe already existed "
+			       "in the system\n", __func__, pe->name);
+			return -EINVAL;
+		}
+	}
+	/* Add it into the d-linked pe list */
+	list_add(&pe->n_list, &ip_vs_pe);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	pr_info("[%s] pe registered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	spin_lock_bh(&ip_vs_pe_lock);
+	if (list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		pr_err("%s(): [%s] pe is not in the list. failed\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Remove it from the d-linked pe list */
+	list_del(&pe->n_list);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] pe unregistered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
-- 
cgit v1.1


From 0d1e71b04a04b6912e50926b9987c1e72facb1f3 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:54 +0900
Subject: IPVS: Allow configuration of persistence engines

Allow the persistence engine of a virtual service to be set, edited
and unset.

This feature only works with the netlink user-space interface.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 57 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5f20caf..a697591 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1134,6 +1134,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 {
 	int ret = 0;
 	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_pe *pe = NULL;
 	struct ip_vs_service *svc = NULL;
 
 	/* increase the module use count */
@@ -1147,6 +1148,16 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		goto out_err;
 	}
 
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out_err;
+		}
+	}
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
 		ret = -EINVAL;
@@ -1184,6 +1195,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		goto out_err;
 	sched = NULL;
 
+	/* Bind the ct retriever */
+	ip_vs_bind_pe(svc, pe);
+	pe = NULL;
+
 	/* Update the virtual service counters */
 	if (svc->port == FTPPORT)
 		atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1215,6 +1230,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		kfree(svc);
 	}
 	ip_vs_scheduler_put(sched);
+	ip_vs_pe_put(pe);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
@@ -1230,6 +1246,7 @@ static int
 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
 	struct ip_vs_scheduler *sched, *old_sched;
+	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
 	int ret = 0;
 
 	/*
@@ -1242,6 +1259,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	}
 	old_sched = sched;
 
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out;
+		}
+		old_pe = pe;
+	}
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
 		ret = -EINVAL;
@@ -1293,12 +1321,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 		}
 	}
 
+	old_pe = svc->pe;
+	if (pe != old_pe) {
+		ip_vs_unbind_pe(svc);
+		ip_vs_bind_pe(svc, pe);
+	}
+
   out_unlock:
 	write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
   out:
-#endif
 	ip_vs_scheduler_put(old_sched);
+	ip_vs_pe_put(old_pe);
 	return ret;
 }
 
@@ -1312,6 +1345,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 {
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
+	struct ip_vs_pe *old_pe;
+
+	pr_info("%s: enter\n", __func__);
 
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
@@ -1324,6 +1360,11 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	ip_vs_unbind_scheduler(svc);
 	ip_vs_scheduler_put(old_sched);
 
+	/* Unbind persistence engine */
+	old_pe = svc->pe;
+	ip_vs_unbind_pe(svc);
+	ip_vs_pe_put(old_pe);
+
 	/* Unbind app inc */
 	if (svc->inc) {
 		ip_vs_app_inc_put(svc->inc);
@@ -2026,6 +2067,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 				  struct ip_vs_service_user *usvc_compat)
 {
+	memset(usvc, 0, sizeof(*usvc));
+
 	usvc->af		= AF_INET;
 	usvc->protocol		= usvc_compat->protocol;
 	usvc->addr.ip		= usvc_compat->addr;
@@ -2043,6 +2086,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
 				   struct ip_vs_dest_user *udest_compat)
 {
+	memset(udest, 0, sizeof(*udest));
+
 	udest->addr.ip		= udest_compat->addr;
 	udest->port		= udest_compat->port;
 	udest->conn_flags	= udest_compat->conn_flags;
@@ -2539,6 +2584,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
 	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
 					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_PENAME_MAXLEN },
 	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
 					    .len = sizeof(struct ip_vs_flags) },
 	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
@@ -2615,6 +2662,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	}
 
 	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	if (svc->pe)
+		NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
 	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2741,11 +2790,12 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
-		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
 			      *nla_netmask;
 		struct ip_vs_flags flags;
 
 		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
 		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
 		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
 		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2763,6 +2813,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		usvc->flags = (usvc->flags & ~flags.mask) |
 			      (flags.flags & flags.mask);
 		usvc->sched_name = nla_data(nla_sched);
+		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
 		usvc->timeout = nla_get_u32(nla_timeout);
 		usvc->netmask = nla_get_u32(nla_netmask);
 	}
-- 
cgit v1.1


From f71499aa11f884255b69ce6c3b3c398c821591a1 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:54 +0900
Subject: IPVS: Fallback if persistence engine fails

Fall back to normal persistence handling if the persistence
engine fails to recognise a packet.

This way, at least the packet will go somewhere.

It is envisaged that iptables could be used to block packets
such if this is not desired although nf_conntrack_sip would
likely need to be enhanced first.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_conn.c |  6 +++---
 net/netfilter/ipvs/ip_vs_core.c | 10 ++++------
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 4adedef..1d1a529 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -154,7 +154,7 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
 	const union nf_inet_addr *addr;
 	__be16 port;
 
-	if (p->pe && p->pe->hashkey_raw)
+	if (p->pe_data && p->pe->hashkey_raw)
 		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
 			ip_vs_conn_tab_mask;
 
@@ -353,7 +353,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (p->pe && p->pe->ct_match) {
+		if (p->pe_data && p->pe->ct_match) {
 			if (p->pe->ct_match(p, cp))
 				goto out;
 			continue;
@@ -956,7 +956,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
 		size_t len = 0;
 
-		if (cp->dest && cp->dest->svc->pe &&
+		if (cp->dest && cp->pe_data &&
 		    cp->dest->svc->pe->show_pe_data) {
 			pe_data[0] = ' ';
 			len = strlen(cp->dest->svc->pe->name);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index ab98893..e5fef7a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -176,7 +176,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
-static inline int
+static inline void
 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 			      struct sk_buff *skb, int protocol,
 			      const union nf_inet_addr *caddr, __be16 cport,
@@ -186,8 +186,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
 	p->pe = svc->pe;
 	if (p->pe && p->pe->fill_param)
-		return p->pe->fill_param(p, skb);
-	return 0;
+		p->pe->fill_param(p, skb);
 }
 
 /*
@@ -268,9 +267,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				vaddr = &fwmark;
 			}
 		}
-		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
-						  vaddr, vport, &param))
-			return NULL;
+		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+					      vaddr, vport, &param);
 	}
 
 	/* Check if a template already exists */
-- 
cgit v1.1


From 758ff03387228824617cef9507e5682488bf9e0c Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sun, 22 Aug 2010 21:37:55 +0900
Subject: IPVS: sip persistence engine

Add the SIP callid as a key for persistence.

This allows multiple connections from the same IP address to be
differentiated on the basis of the callid.

When used in conjunction with the persistence mask, it allows connections
from different  IP addresses to be aggregated on the basis of the callid.

It is envisaged that a persistence mask of 0.0.0.0 will be a useful
setting.  That is, ignore the source IP address when checking for
persistence.

It is envisaged that this option will be used in conjunction with
one-packet scheduling.

This only works with UDP and cannot be made to work with TCP
within the current framework.

Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/Kconfig        |   7 ++
 net/netfilter/ipvs/Makefile       |   3 +
 net/netfilter/ipvs/ip_vs_pe_sip.c | 167 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 177 insertions(+)
 create mode 100644 net/netfilter/ipvs/ip_vs_pe_sip.c

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index af3c9f4..a22dac2 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -256,4 +256,11 @@ config	IP_VS_NFCT
 	  connection state to be exported to the Netfilter framework
 	  for filtering purposes.
 
+config	IP_VS_PE_SIP
+	tristate "SIP persistence engine"
+        depends on IP_VS_PROTO_UDP
+	depends on NF_CONNTRACK_SIP
+	---help---
+	  Allow persistence based on the SIP Call-ID
+
 endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index 4a87bf3..34ee602 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -35,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 0000000..a0539f1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,167 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+	memcpy(buf + *idx, callid, len);
+	buf[*idx+len] = '\0';
+	*idx += len + 1;
+	return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len)					\
+	ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),		\
+			 callid, len, &ip_vs_dbg_idx)
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+		      unsigned int datalen,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	/* Find callid */
+	while (1) {
+		int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+					    SIP_HDR_CALL_ID, matchoff,
+					    matchlen);
+		if (ret > 0)
+			break;
+		if (!ret)
+			return 0;
+		dataoff += *matchoff;
+	}
+
+	/* Empty callid is useless */
+	if (!*matchlen)
+		return -EINVAL;
+
+	/* Too large is useless */
+	if (*matchlen > IP_VS_PEDATA_MAXLEN)
+		return -EINVAL;
+
+	/* SIP headers are always followed by a line terminator */
+	if (*matchoff + *matchlen == datalen)
+		return -EINVAL;
+
+	/* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+	 * RFC 3261 allows only CRLF, we support both. */
+	if (*(dptr + *matchoff + *matchlen) != '\r' &&
+	    *(dptr + *matchoff + *matchlen) != '\n')
+		return -EINVAL;
+
+	IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+		      IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+		      *matchlen);
+	return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen, matchoff, matchlen;
+	const char *dptr;
+
+	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+
+	/* No Data ? */
+	dataoff = iph.len + sizeof(struct udphdr);
+	if (dataoff >= skb->len)
+		return -EINVAL;
+
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+
+	if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure
+	 */
+	memcpy(p->pe_data, dptr + matchoff, matchlen);
+	p->pe_data_len = matchlen;
+
+	return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+				  struct ip_vs_conn *ct)
+
+{
+	bool ret = 0;
+
+	if (ct->af == p->af &&
+	    ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+	    /* protocol should only be IPPROTO_IP if
+	     * d_addr is a fwmark */
+	    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+			     p->vaddr, &ct->vaddr) &&
+	    ct->vport == p->vport &&
+	    ct->flags & IP_VS_CONN_F_TEMPLATE &&
+	    ct->protocol == p->protocol &&
+	    ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+	    !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+		ret = 1;
+
+	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      ret ? "hit" : "not hit");
+
+	return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval, bool inverse)
+{
+	return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	memcpy(buf, cp->pe_data, cp->pe_data_len);
+	return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+	.name =			"sip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+	.fill_param =		ip_vs_sip_fill_param,
+	.ct_match =		ip_vs_sip_ct_match,
+	.hashkey_raw =		ip_vs_sip_hashkey_raw,
+	.show_pe_data =		ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+	return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+	unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
-- 
cgit v1.1


From e55df53dd62c73185af46fb6ffa7074b05ceefc4 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Mon, 4 Oct 2010 21:00:42 +0200
Subject: netfilter: remove duplicated include

Remove duplicated include.

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_ipvs.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 7a4d66d..9127a3d 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -16,7 +16,6 @@
 #include <linux/ip_vs.h>
 #include <linux/types.h>
 #include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_ipvs.h>
 #include <net/netfilter/nf_conntrack.h>
 
-- 
cgit v1.1


From f68c53015c5b9aa98ffd87a34009f89bdbbd7160 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Mon, 4 Oct 2010 22:24:12 +0200
Subject: netfilter: unregister nf hooks, matches and targets in the reverse
 order

Since we register nf hooks, matches and targets in order, we'd better
unregister them in the reverse order.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/core.c     |  6 ++----
 net/netfilter/x_tables.c | 12 ++++--------
 2 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d..8f014f2 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
 
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		nf_unregister_hook(&reg[i]);
+	while (n-- > 0)
+		nf_unregister_hook(&reg[n]);
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e34622f..8046350 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
 void
 xt_unregister_targets(struct xt_target *target, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		xt_unregister_target(&target[i]);
+	while (n-- > 0)
+		xt_unregister_target(&target[n]);
 }
 EXPORT_SYMBOL(xt_unregister_targets);
 
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
 void
 xt_unregister_matches(struct xt_match *match, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		xt_unregister_match(&match[i]);
+	while (n-- > 0)
+		xt_unregister_match(&match[n]);
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
-- 
cgit v1.1


From a91fd267e327ca7599654b4e9ed7b62c5adaccee Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Wed, 13 Oct 2010 21:22:35 +0200
Subject: IPVS: ip_vs_dbg_callid() is only needed for debugging

ip_vs_dbg_callid() and IP_VS_DEBUG_CALLID() are only needed
it CONFIG_IP_VS_DEBUG is defined.

This resolves the following build warning when CONFIG_IP_VS_DEBUG is
not defined.

net/netfilter/ipvs/ip_vs_pe_sip.c:11: warning: 'ip_vs_dbg_callid' defined but not used

Reported-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/ip_vs_pe_sip.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index a0539f1..b8b4e96 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -8,6 +8,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <linux/netfilter/nf_conntrack_sip.h>
 
+#ifdef CONFIG_IP_VS_DEBUG
 static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
 				    const char *callid, size_t callid_len,
 				    int *idx)
@@ -22,6 +23,7 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
 #define IP_VS_DEBUG_CALLID(callid, len)					\
 	ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),		\
 			 callid, len, &ip_vs_dbg_idx)
+#endif
 
 static int get_callid(const char *dptr, unsigned int dataoff,
 		      unsigned int datalen,
-- 
cgit v1.1


From ebbf41df4aabb6d506fa18ea8cb4c2b4388a18b9 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Oct 2010 10:19:06 +0200
Subject: netfilter: ctnetlink: add expectation deletion events

This patch allows to listen to events that inform about
expectations destroyed.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_expect.c  |  6 ++++--
 net/netfilter/nf_conntrack_netlink.c | 30 +++++++++++++++++++++---------
 2 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index b30a1f2..46e8966 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -41,7 +41,8 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 static HLIST_HEAD(nf_ct_userspace_expect_list);
 
 /* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+				u32 pid, int report)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct net *net = nf_ct_exp_net(exp);
@@ -55,11 +56,12 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 	if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
 		master_help->expecting[exp->class]--;
 
+	nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
 	nf_ct_expect_put(exp);
 
 	NF_CT_STAT_INC(net, expect_delete);
 }
-EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
 
 static void nf_ct_expectation_timed_out(unsigned long ul_expect)
 {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index b4077be..62bad22 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1632,17 +1632,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
 	struct sk_buff *skb;
-	unsigned int type;
+	unsigned int type, group;
 	int flags = 0;
 
-	if (events & (1 << IPEXP_NEW)) {
+	if (events & (1 << IPEXP_DESTROY)) {
+		type = IPCTNL_MSG_EXP_DELETE;
+		group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+	} else if (events & (1 << IPEXP_NEW)) {
 		type = IPCTNL_MSG_EXP_NEW;
 		flags = NLM_F_CREATE|NLM_F_EXCL;
+		group = NFNLGRP_CONNTRACK_EXP_NEW;
 	} else
 		return 0;
 
-	if (!item->report &&
-	    !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
+	if (!item->report && !nfnetlink_has_listeners(net, group))
 		return 0;
 
 	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1665,8 +1668,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
-		       item->report, GFP_ATOMIC);
+	nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
 	return 0;
 
 nla_put_failure:
@@ -1849,7 +1851,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		}
 
 		/* after list removal, usage count == 1 */
-		nf_ct_unexpect_related(exp);
+		spin_lock_bh(&nf_conntrack_lock);
+		if (del_timer(&exp->timeout)) {
+			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+						   nlmsg_report(nlh));
+			nf_ct_expect_put(exp);
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
 		/* have to put what we 'get' above.
 		 * after this line usage count == 0 */
 		nf_ct_expect_put(exp);
@@ -1866,7 +1874,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 				m_help = nfct_help(exp->master);
 				if (!strcmp(m_help->helper->name, name) &&
 				    del_timer(&exp->timeout)) {
-					nf_ct_unlink_expect(exp);
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
 					nf_ct_expect_put(exp);
 				}
 			}
@@ -1880,7 +1890,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 						  &net->ct.expect_hash[i],
 						  hnode) {
 				if (del_timer(&exp->timeout)) {
-					nf_ct_unlink_expect(exp);
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
 					nf_ct_expect_put(exp);
 				}
 			}
-- 
cgit v1.1


From 714f095f74582764d629785f03b459a3d0503624 Mon Sep 17 00:00:00 2001
From: Hans Schillstrom <hans.schillstrom@ericsson.com>
Date: Tue, 19 Oct 2010 10:38:48 +0200
Subject: ipvs: IPv6 tunnel mode

IPv6 encapsulation uses a bad source address for the tunnel.
i.e. VIP will be used as local-addr and encap. dst addr.
Decapsulation will not accept this.

Example
LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
   (eth0 2003::1:0:1/96)
RS  (ethX 2003::1:0:5/96)

tcpdump
2003::2:0:100 > 2003::1:0:5: IP6 (hlim 63, next-header TCP (6) payload length: 40)  2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312 (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val 1904932 ecr 0,nop,wscale 3], length 0

In Linux IPv6 impl. you can't have a tunnel with an any cast address
receiving packets (I have not tried to interpret RFC 2473)
To have receive capabilities the tunnel must have:
 - Local address set as multicast addr or an unicast addr
 - Remote address set as an unicast addr.
 - Loop back addres or Link local address are not allowed.

This causes us to setup a tunnel in the Real Server with the
LVS as the remote address, here you can't use the VIP address since it's
used inside the tunnel.

Solution
Use outgoing interface IPv6 address (match against the destination).
i.e. use ip6_route_output() to look up the route cache and
then use ipv6_dev_get_saddr(...) to set the source address of the
encapsulated packet.

Additionally, cache the results in new destination
fields: dst_cookie and dst_saddr and properly check the
returned dst from ip6_route_output. We now add xfrm_lookup
call only for the tunneling method where the source address
is a local one.

Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 171 +++++++++++++++++++++-------------------
 1 file changed, 92 insertions(+), 79 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 8817afa..b0bd8af 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -26,6 +26,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -37,26 +38,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
 {
 	struct dst_entry *old_dst;
 
 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
 	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }
 
 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
 	struct dst_entry *dst = dest->dst_cache;
 
 	if (!dst)
 		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
 }
 
 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
 		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
 			struct flowi fl = {
 				.oif = 0,
 				.nl_u = {
@@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 						.tos = rtos, } },
 			};
 
-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 					     &dest->addr.ip);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
 				  &dest->addr.ip,
 				  atomic_read(&rt->dst.__refcnt), rtos);
@@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 					.tos = rtos, } },
 		};
 
-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 				     &cp->daddr.ip);
 			return NULL;
@@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 }
 
 #ifdef CONFIG_IP_VS_IPV6
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+	return dst;
+
+out_err:
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		      struct in6_addr *ret_saddr, int do_xfrm)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;
+	struct dst_entry *dst;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
 		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;
 
-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
 				  atomic_read(&rt->dst.__refcnt));
 		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
+					      do_xfrm);
+		if (!dst)
 			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
 	}
 
 	return rt;
@@ -248,6 +268,7 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
 	u8     tos = iph->tos;
@@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (ip_route_output_key(&init_net, &rt, &fl)) {
+	if (ip_route_output_key(net, &rt, &fl)) {
 		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
 			     __func__, &iph->daddr);
 		goto tx_error_icmp;
@@ -313,25 +334,18 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};
 
 	EnterFunction(10);
 
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
+	if (!dst)
 		goto tx_error_icmp;
-	}
+	rt = (struct rt6_info *) dst;
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;
 
 	/* MTU checking */
@@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;
 
@@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct iphdr  *old_iph = ip_hdr(skb);
 	u8     tos = old_iph->tos;
 	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
@@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
 		goto tx_error_icmp;
 
 	tdev = rt->dst.dev;
@@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;
 
 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
@@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
 	struct net_device *tdev;	/* Device to other host */
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
@@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
 	if (!rt)
 		goto tx_error_icmp;
 
 	tdev = rt->dst.dev;
 
 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
+	if (mtu < IPV6_MIN_MTU) {
 		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
 		goto tx_error;
 	}
 	if (skb_dst(skb))
@@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
 	iph->priority		=	old_iph->priority;
 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
+	ipv6_addr_copy(&iph->saddr, &saddr);
 	iph->hop_limit		=	old_iph->hop_limit;
 
 	/* Another hack: avoid icmp_send in ip_fragment */
@@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;
 
 	/* MTU checking */
@@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;
 
@@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
 		goto tx_error_icmp;
 
 	/* MTU checking */
@@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;
 
-- 
cgit v1.1


From d86bef73b4a24e59e7c1f896a72bbf38430ac2c6 Mon Sep 17 00:00:00 2001
From: Eduardo Blanco <ejblanco@google.com>
Date: Tue, 19 Oct 2010 10:26:47 +0100
Subject: Fixed race condition at ip_vs.ko module init.

Lists were initialized after the module was registered.  Multiple ipvsadm
processes at module load triggered a race condition that resulted in a null
pointer dereference in do_ip_vs_get_ctl(). As a result, __ip_vs_mutex
was left locked preventing all further ipvsadm commands.

Signed-off-by: Eduardo J. Blanco <ejblanco@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a697591..0b884d3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3400,6 +3400,16 @@ int __init ip_vs_control_init(void)
 
 	EnterFunction(2);
 
+	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+	}
+	smp_wmb();
+
 	ret = nf_register_sockopt(&ip_vs_sockopts);
 	if (ret) {
 		pr_err("cannot register sockopt.\n");
@@ -3418,15 +3428,6 @@ int __init ip_vs_control_init(void)
 
 	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
 
-	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-	}
-	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
-	}
-
 	ip_vs_new_estimator(&ip_vs_stats);
 
 	/* Hook the defense timer */
-- 
cgit v1.1


From 5bc9068e9d962ca6b8bec3f0eb6f60ab4dee1d04 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:14:31 +0300
Subject: ipvs: fix CHECKSUM_PARTIAL for TCP, UDP

 	Fix CHECKSUM_PARTIAL handling. Tested for IPv4 TCP,
UDP not tested because it needs network card with HW CSUM support.
May be fixes problem where IPVS can not be used in virtual boxes.
Problem appears with DNAT to local address when the local stack
sends reply in CHECKSUM_PARTIAL mode.

 	Fix tcp_dnat_handler and udp_dnat_handler to provide
vaddr and daddr in right order (old and new IP) when calling
tcp_partial_csum_update/udp_partial_csum_update (CHECKSUM_PARTIAL).

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 +++++-----
 net/netfilter/ipvs/ip_vs_proto_udp.c | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 282d24d..318d011 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -101,15 +101,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		tcph->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
+						csum_unfold(tcph->check))));
 	else
 #endif
 	tcph->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
+						csum_unfold(tcph->check))));
 }
 
 
@@ -223,7 +223,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	 *	Adjust TCP checksums
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
 	} else if (!cp->app) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 8553231..f929089 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -102,15 +102,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		uhdr->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
+						csum_unfold(uhdr->check))));
 	else
 #endif
 	uhdr->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
+						csum_unfold(uhdr->check))));
 }
 
 
@@ -229,7 +229,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	 *	Adjust UDP checksums
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+		udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
 	} else if (!cp->app && (udph->check != 0)) {
-- 
cgit v1.1


From 8b27b10f5863a5b63e46304a71aa01463d1efac4 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:17:20 +0300
Subject: ipvs: optimize checksums for apps

 	Avoid full checksum calculation for apps that can provide
info whether csum was broken after payload mangling. For now only
ip_vs_ftp mangles payload and it updates the csum, so the full
recalculation is avoided for all packets.

 	Add CHECKSUM_UNNECESSARY for snat_handler (TCP and UDP).
It is needed to support SNAT from local address for the case
when csum is fully recalculated.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ftp.c       |  7 ++++++-
 net/netfilter/ipvs/ip_vs_proto_tcp.c | 31 +++++++++++++++++++++++++------
 net/netfilter/ipvs/ip_vs_proto_udp.c | 31 +++++++++++++++++++++++++------
 3 files changed, 56 insertions(+), 13 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 090889a..7545500 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -242,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
 						       start-data, end-start,
 						       buf, buf_len);
-			if (ret)
+			if (ret) {
 				ip_vs_nfct_expect_related(skb, ct, n_cp,
 							  IPPROTO_TCP, 0, 0);
+				if (skb->ip_summed == CHECKSUM_COMPLETE)
+					skb->ip_summed = CHECKSUM_UNNECESSARY;
+				/* csum is updated */
+				ret = 1;
+			}
 		}
 
 		/*
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 318d011..64dc295 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -120,6 +120,7 @@ tcp_snat_handler(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -134,13 +135,20 @@ tcp_snat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/* Call application helper if needed */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
 	}
 
 	tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +159,13 @@ tcp_snat_handler(struct sk_buff *skb,
 		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
-	} else if (!cp->app) {
+	} else if (!payload_csum) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		tcph->check = 0;
@@ -174,6 +183,7 @@ tcp_snat_handler(struct sk_buff *skb,
 							skb->len - tcphoff,
 							cp->protocol,
 							skb->csum);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, tcph->check,
@@ -190,6 +200,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -204,6 +215,8 @@ tcp_dnat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -212,8 +225,13 @@ tcp_dnat_handler(struct sk_buff *skb,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn and iph ack_seq stuff
 		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
 	}
 
 	tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -226,12 +244,13 @@ tcp_dnat_handler(struct sk_buff *skb,
 		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
-	} else if (!cp->app) {
+	} else if (!payload_csum) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		tcph->check = 0;
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index f929089..9c558c4 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -121,6 +121,7 @@ udp_snat_handler(struct sk_buff *skb,
 	struct udphdr *udph;
 	unsigned int udphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -135,6 +136,8 @@ udp_snat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -142,8 +145,13 @@ udp_snat_handler(struct sk_buff *skb,
 		/*
 		 *	Call application helper if needed
 		 */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
 	}
 
 	udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +164,13 @@ udp_snat_handler(struct sk_buff *skb,
 		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
+	} else if (!payload_csum && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		udph->check = 0;
@@ -181,6 +190,7 @@ udp_snat_handler(struct sk_buff *skb,
 							skb->csum);
 		if (udph->check == 0)
 			udph->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, udph->check,
 			  (char*)&(udph->check) - (char*)udph);
@@ -196,6 +206,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	struct udphdr *udph;
 	unsigned int udphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -210,6 +221,8 @@ udp_dnat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -218,8 +231,13 @@ udp_dnat_handler(struct sk_buff *skb,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn
 		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
 	}
 
 	udph = (void *)skb_network_header(skb) + udphoff;
@@ -232,12 +250,13 @@ udp_dnat_handler(struct sk_buff *skb,
 		udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
+	} else if (!payload_csum && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		udph->check = 0;
-- 
cgit v1.1


From cf356d69db0afef692cd640917bc70f708c27f14 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:21:07 +0300
Subject: ipvs: switch to notrack mode

 	Change skb->ipvs_property semantic. This is preparation
to support ip_vs_out processing in LOCAL_OUT. ipvs_property=1
will be used to avoid expensive lookups for traffic sent by
transmitters. Now when conntrack support is not used we call
ip_vs_notrack method to avoid problems in OUTPUT and
POST_ROUTING hooks instead of exiting POST_ROUTING as before.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 39 ++++-----------------------------------
 net/netfilter/ipvs/ip_vs_xmit.c |  7 +++++--
 2 files changed, 9 insertions(+), 37 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e5fef7a..2224530 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -507,23 +507,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	return NF_DROP;
 }
 
-/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- * chain and is used to avoid double NAT and confirmation when we do
- * not want to keep the conntrack structure
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	if (!skb->ipvs_property)
-		return NF_ACCEPT;
-	/* The packet was sent from IPVS, exit this chain */
-	return NF_STOP;
-}
-
 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -682,8 +665,9 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
+	skb->ipvs_property = 1;
 	if (!(cp->flags & IP_VS_CONN_F_NFCT))
-		skb->ipvs_property = 1;
+		ip_vs_notrack(skb);
 	else
 		ip_vs_update_conntrack(skb, cp, 0);
 	verdict = NF_ACCEPT;
@@ -929,8 +913,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 	ip_vs_out_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+	skb->ipvs_property = 1;
 	if (!(cp->flags & IP_VS_CONN_F_NFCT))
-		skb->ipvs_property = 1;
+		ip_vs_notrack(skb);
 	else
 		ip_vs_update_conntrack(skb, cp, 0);
 	ip_vs_conn_put(cp);
@@ -1496,14 +1481,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP_PRI_NAT_SRC-1,
-	},
 #ifdef CONFIG_IP_VS_IPV6
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
@@ -1532,14 +1509,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
-	{
-		.hook		= ip_vs_post_routing,
-		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP6_PRI_NAT_SRC-1,
-	},
 #endif
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b0bd8af..94b53b4 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -217,6 +217,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 ({								\
 	int __ret = NF_ACCEPT;					\
 								\
+	(skb)->ipvs_property = 1;				\
 	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\
 		__ret = ip_vs_confirm_conntrack(skb, cp);	\
 	if (__ret == NF_ACCEPT) {				\
@@ -228,8 +229,9 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 
 #define IP_VS_XMIT_NAT(pf, skb, cp)				\
 do {							\
+	(skb)->ipvs_property = 1;			\
 	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
-		(skb)->ipvs_property = 1;		\
+		ip_vs_notrack(skb);			\
 	else						\
 		ip_vs_update_conntrack(skb, cp, 1);	\
 	skb_forward_csum(skb);				\
@@ -239,8 +241,9 @@ do {							\
 
 #define IP_VS_XMIT(pf, skb, cp)				\
 do {							\
+	(skb)->ipvs_property = 1;			\
 	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
-		(skb)->ipvs_property = 1;		\
+		ip_vs_notrack(skb);			\
 	skb_forward_csum(skb);				\
 	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
 		skb_dst(skb)->dev, dst_output);		\
-- 
cgit v1.1


From 190ecd27cd7294105e3b26ca71663c7d940acbbb Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:24:37 +0300
Subject: ipvs: do not schedule conns from real servers

 	This patch is needed to avoid scheduling of
packets from local real server when we add ip_vs_in
in LOCAL_OUT hook to support local client.

 	Currently, when ip_vs_in can not find existing
connection it tries to create new one by calling ip_vs_schedule.

 	The default indication from ip_vs_schedule was if
connection was scheduled to real server. If real server is
not available we try to use the bypass forwarding method
or to send ICMP error. But in some cases we do not want to use
the bypass feature. So, add flag 'ignored' to indicate if
the scheduler ignores this packet.

 	Make sure we do not create new connections from replies.
We can hit this problem for persistent services and local real
server when ip_vs_in is added to LOCAL_OUT hook to handle
local clients.

 	Also, make sure ip_vs_schedule ignores SYN packets
for Active FTP DATA from local real server. The FTP DATA
connection should be created on SYN+ACK from client to assign
correct connection daddr.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c       | 34 ++++++++++++++++++++++++++++++++--
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  6 ++++--
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |  7 +++++--
 net/netfilter/ipvs/ip_vs_proto_udp.c  |  6 ++++--
 4 files changed, 45 insertions(+), 8 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2224530..0090d6d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -342,7 +342,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+	       struct ip_vs_protocol *pp, int *ignored)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
@@ -350,16 +351,43 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
 	__be16 _ports[2], *pptr;
 	unsigned int flags;
 
+	*ignored = 1;
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return NULL;
 
 	/*
+	 * FTPDATA needs this check when using local real server.
+	 * Never schedule Active FTPDATA connections from real server.
+	 * For LVS-NAT they must be already created. For other methods
+	 * with persistence the connection is created on SYN+ACK.
+	 */
+	if (pptr[0] == FTPDATA) {
+		IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA");
+		return NULL;
+	}
+
+	/*
+	 * Do not schedule replies from local real server. It is risky
+	 * for fwmark services but mostly for persistent services.
+	 */
+	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+		IP_VS_DBG_PKT(12, pp, skb, 0,
+			      "Not scheduling reply for existing connection");
+		__ip_vs_conn_put(cp);
+		return NULL;
+	}
+
+	/*
 	 *    Persistent service
 	 */
-	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+		*ignored = 0;
 		return ip_vs_sched_persist(svc, skb, pptr);
+	}
 
 	/*
 	 *    Non-persistent service
@@ -372,6 +400,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
 		return NULL;
 	}
 
+	*ignored = 0;
+
 	dest = svc->scheduler->schedule(svc, skb);
 	if (dest == NULL) {
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4c0855c..9ab5232 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	if ((sch->type == SCTP_CID_INIT) &&
 	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				     &iph.daddr, sh->dest))) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 64dc295..85d80a6 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return 0;
 	}
 
+	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	if (th->syn &&
 	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
 				     th->dest))) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9c558c4..5d21f08 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				&iph.daddr, uh->dest);
 	if (svc) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
-- 
cgit v1.1


From 489fdedaed5ddb437dd2840eb93df37a6dd8c7de Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:27:31 +0300
Subject: ipvs: stop ICMP from FORWARD to local

 	Delivering locally ICMP from FORWARD hook is not supported.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0090d6d..27ecb25 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -48,6 +48,7 @@
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
 #endif
 
 #include <net/ip_vs.h>
@@ -1191,7 +1192,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-	/* do not touch skb anymore */
+	/* LOCALNODE from FORWARD hook is not supported */
+	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+	    skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+		IP_VS_DBG(1, "%s(): "
+			  "local delivery to %pI4 but in FORWARD\n",
+			  __func__, &skb_rtable(skb)->rt_dst);
+		verdict = NF_DROP;
+	}
 
   out:
 	__ip_vs_conn_put(cp);
@@ -1212,6 +1220,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct ip_vs_protocol *pp;
 	unsigned int offset, verdict;
 	union nf_inet_addr snet;
+	struct rt6_info *rt;
 
 	*related = 1;
 
@@ -1290,7 +1299,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	    IPPROTO_SCTP == cih->nexthdr)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-	/* do not touch skb anymore */
+	/* LOCALNODE from FORWARD hook is not supported */
+	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+	    (rt = (struct rt6_info *) skb_dst(skb)) &&
+	    rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "local delivery to %pI6 but in FORWARD\n",
+			  __func__, &rt->rt6i_dst);
+		verdict = NF_DROP;
+	}
 
 	__ip_vs_conn_put(cp);
 
-- 
cgit v1.1


From 4256f1aaa662697c1faa0984b7a698c2c8c57735 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:29:40 +0300
Subject: ipvs: fix CHECKSUM_PARTIAL for TUN method

 	The recent change in IP_VS_XMIT_TUNNEL to set
CHECKSUM_NONE is not correct. After adding IPIP header
skb->csum becomes invalid but the CHECKSUM_PARTIAL
case must be supported. So, use skb_forward_csum() which is
most suitable for us to allow local clients to send IPIP
to remote real server.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 94b53b4..63cc0fe 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -222,7 +222,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 		__ret = ip_vs_confirm_conntrack(skb, cp);	\
 	if (__ret == NF_ACCEPT) {				\
 		nf_reset(skb);					\
-		(skb)->ip_summed = CHECKSUM_NONE;		\
+		skb_forward_csum(skb);				\
 	}							\
 	__ret;							\
 })
-- 
cgit v1.1


From 1ca5bb5450aa2401fa272efeb741ebb260d0fbb0 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:32:29 +0300
Subject: ipvs: create ip_vs_defrag_user

 	Create new function ip_vs_defrag_user to return correct
IP_DEFRAG_xxx user depending on the hooknum. It will be needed
when we add handlers in LOCAL_OUT.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 55 +++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 21 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 27ecb25..f7f5283 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -543,6 +543,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 }
 
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+	if (NF_INET_LOCAL_IN == hooknum)
+		return IP_DEFRAG_VS_IN;
+	if (NF_INET_FORWARD == hooknum)
+		return IP_DEFRAG_VS_FWD;
+	return IP_DEFRAG_VS_OUT;
+}
+
 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
 	int err = ip_defrag(skb, user);
@@ -714,7 +723,8 @@ out:
  *	Find any that might be relevant, check against existing connections.
  *	Currently handles error types - unreachable, quench, ttl exceeded.
  */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+			  unsigned int hooknum)
 {
 	struct iphdr *iph;
 	struct icmphdr	_icmph, *ic;
@@ -729,7 +739,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 
 	/* reassemble IP fragments */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -788,7 +798,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+			     unsigned int hooknum)
 {
 	struct ipv6hdr *iph;
 	struct icmp6hdr	_icmph, *ic;
@@ -804,7 +815,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 
 	/* reassemble IP fragments */
 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -986,7 +997,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+			int related;
+			int verdict = ip_vs_out_icmp_v6(skb, &related,
+							hooknum);
 
 			if (related) {
 				if (sysctl_ip_vs_snat_reroute &&
@@ -1000,7 +1013,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	} else
 #endif
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_out_icmp(skb, &related);
+			int related;
+			int verdict = ip_vs_out_icmp(skb, &related, hooknum);
 
 			if (related) {
 				if (sysctl_ip_vs_snat_reroute &&
@@ -1019,19 +1033,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	/* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-			if (related)
-				return verdict;
-
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+			if (ip_vs_gather_frags_v6(skb,
+						  ip_vs_defrag_user(hooknum)))
+				return NF_STOLEN;
 		}
+
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 	} else
 #endif
 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
 			     !pp->dont_defrag)) {
-			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+			if (ip_vs_gather_frags(skb,
+					       ip_vs_defrag_user(hooknum)))
 				return NF_STOLEN;
 
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1114,8 +1128,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	/* reassemble IP fragments */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -1226,9 +1239,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	/* reassemble IP fragments */
 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-					       IP_DEFRAG_VS_IN :
-					       IP_DEFRAG_VS_FWD))
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -1349,7 +1360,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+			int related;
+			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -1358,7 +1370,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	} else
 #endif
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+			int related;
+			int verdict = ip_vs_in_icmp(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
-- 
cgit v1.1


From f5a41847acc535e2e2018e397b1876ba7577d9d9 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:35:46 +0300
Subject: ipvs: move ip_route_me_harder for ICMP

 	Currently, ip_route_me_harder after ip_vs_out_icmp
is called even if packet is not related to IPVS connection.
Move it into handle_response_icmp. Also, force rerouting
if sending to local client because IPv4 stack uses addresses
from the route.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f7f5283..c4f091d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -702,6 +702,17 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 #endif
 		ip_vs_nat_icmp(skb, pp, cp, 1);
 
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+			goto out;
+	} else
+#endif
+		if ((sysctl_ip_vs_snat_reroute ||
+		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto out;
+
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
@@ -940,16 +951,16 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	 * if it came from this machine itself.  So re-compute
 	 * the routing information.
 	 */
-	if (sysctl_ip_vs_snat_reroute) {
 #ifdef CONFIG_IP_VS_IPV6
-		if (af == AF_INET6) {
-			if (ip6_route_me_harder(skb) != 0)
-				goto drop;
-		} else
+	if (af == AF_INET6) {
+		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+			goto drop;
+	} else
 #endif
-			if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-				goto drop;
-	}
+		if ((sysctl_ip_vs_snat_reroute ||
+		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto drop;
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 
@@ -1001,13 +1012,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 			int verdict = ip_vs_out_icmp_v6(skb, &related,
 							hooknum);
 
-			if (related) {
-				if (sysctl_ip_vs_snat_reroute &&
-					NF_ACCEPT == verdict &&
-					ip6_route_me_harder(skb))
-					verdict = NF_DROP;
+			if (related)
 				return verdict;
-			}
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 		}
 	} else
@@ -1016,13 +1022,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 			int related;
 			int verdict = ip_vs_out_icmp(skb, &related, hooknum);
 
-			if (related) {
-				if (sysctl_ip_vs_snat_reroute &&
-					NF_ACCEPT == verdict &&
-					ip_route_me_harder(skb, RTN_LOCAL))
-					verdict = NF_DROP;
+			if (related)
 				return verdict;
-			}
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 		}
 
-- 
cgit v1.1


From fc604767613b6d2036cdc35b660bc39451040a47 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:38:15 +0300
Subject: ipvs: changes for local real server

 	This patch deals with local real servers:

- Add support for DNAT to local address (different real server port).
It needs ip_vs_out hook in LOCAL_OUT for both families because
skb->protocol is not set for locally generated packets and can not
be used to set 'af'.

- Skip packets in ip_vs_in marked with skb->ipvs_property because
ip_vs_out processing can be executed in LOCAL_OUT but we still
have the conn_out_get check in ip_vs_in.

- Ignore packets with inet->nodefrag from local stack

- Require skb_dst(skb) != NULL because we use it to get struct net

- Add support for changing the route to local IPv4 stack after DNAT
depending on the source address type. Local client sets output
route and the remote client sets input route. It looks like
IPv6 does not need such rerouting because the replies use
addresses from initial incoming header, not from skb route.

- All transmitters now have strict checks for the destination
address type: redirect from non-local address to local real
server requires NAT method, local address can not be used as
source address when talking to remote real server.

- Now LOCALNODE is not set explicitly as forwarding
method in real server to allow the connections to provide
correct forwarding method to the backup server. Not sure if
this breaks tools that expect to see 'Local' real server type.
If needed, this can be supported with new flag IP_VS_DEST_F_LOCAL.
Now it should be possible connections in backup that lost
their fwmark information during sync to be forwarded properly
to their daddr, even if it is local address in the backup server.
By this way backup could be used as real server for DR or TUN,
for NAT there are some restrictions because tuple collisions
in conntracks can create problems for the traffic.

- Call ip_vs_dst_reset when destination is updated in case
some real server IP type is changed between local and remote.

[ horms@verge.net.au: removed trailing whitespace ]
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 123 ++++++++++--
 net/netfilter/ipvs/ip_vs_ctl.c  |  18 +-
 net/netfilter/ipvs/ip_vs_xmit.c | 433 ++++++++++++++++++++++++++++++++--------
 3 files changed, 457 insertions(+), 117 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c4f091d..a6c8aff 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -984,26 +984,34 @@ drop:
 }
 
 /*
- *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
  *	Check if outgoing packet belongs to the established ip_vs_conn.
  */
 static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-	  const struct net_device *in, const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 {
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int af;
 
 	EnterFunction(11);
 
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
+
+	if (unlikely(!skb_dst(skb)))
+		return NF_ACCEPT;
+
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
@@ -1106,6 +1114,69 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	return handle_response(af, skb, pp, cp, iph.len);
 }
 
+/*
+ *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_out(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_out(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return verdict;
+}
+
+#endif
 
 /*
  *	Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1342,6 +1413,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	struct ip_vs_conn *cp;
 	int ret, restart, af, pkts;
 
+	/* Already marked as IPVS request or reply? */
+	if (skb->ipvs_property)
+		return NF_ACCEPT;
+
 	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1525,13 +1600,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_LOCAL_IN,
 		.priority       = 100,
 	},
-	/* After packet filtering, change source only for VS/NAT */
+	/* Before ip_vs_in, change source only for VS/NAT */
 	{
-		.hook		= ip_vs_out,
+		.hook		= ip_vs_local_reply4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -99,
 	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1542,6 +1617,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
 #ifdef CONFIG_IP_VS_IPV6
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
@@ -1553,13 +1636,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_LOCAL_IN,
 		.priority       = 100,
 	},
-	/* After packet filtering, change source only for VS/NAT */
+	/* Before ip_vs_in, change source only for VS/NAT */
 	{
-		.hook		= ip_vs_out,
+		.hook		= ip_vs_local_reply6,
 		.owner		= THIS_MODULE,
-		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -99,
 	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1570,6 +1653,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum        = NF_INET_FORWARD,
 		.priority       = 99,
 	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
 #endif
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0b884d3..5f5daa3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -777,20 +777,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
 	conn_flags |= IP_VS_CONN_F_INACTIVE;
 
-	/* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6) {
-		if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
-	} else
-#endif
-		if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
-
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
 	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
@@ -824,6 +810,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	dest->u_threshold = udest->u_threshold;
 	dest->l_threshold = udest->l_threshold;
 
+	spin_lock(&dest->dst_lock);
+	ip_vs_dst_reset(dest);
+	spin_unlock(&dest->dst_lock);
+
 	if (add)
 		ip_vs_new_estimator(&dest->stats);
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 63cc0fe..8608882 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -67,12 +67,19 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 	return dst;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *	    &4=Allow redirect from remote daddr to local
+ */
 static struct rtable *
-__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+		   __be32 daddr, u32 rtos, int rt_mode)
 {
-	struct net *net = dev_net(skb->dev);
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rtable *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
+	struct rtable *ort;			/* Original route */
+	int local;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
@@ -104,23 +111,95 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
 			.oif = 0,
 			.nl_u = {
 				.ip4_u = {
-					.daddr = cp->daddr.ip,
+					.daddr = daddr,
 					.saddr = 0,
 					.tos = rtos, } },
 		};
 
 		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
-				     &cp->daddr.ip);
+				     &daddr);
 			return NULL;
 		}
 	}
 
+	local = rt->rt_flags & RTCF_LOCAL;
+	if (!((local ? 1 : 2) & rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+			     (rt->rt_flags & RTCF_LOCAL) ?
+			     "local":"non-local", &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+					 ort->rt_flags & RTCF_LOCAL)) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+			     "requires NAT method, dest: %pI4\n",
+			     &ip_hdr(skb)->daddr, &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+			     "to non-local address, dest: %pI4\n",
+			     &ip_hdr(skb)->saddr, &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+
 	return rt;
 }
 
+/* Reroute packet to local IPv4 stack after DNAT */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = rt->dst.dev;
+	struct net *net = dev_net(dev);
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (rt->fl.iif) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
+			return 0;
+		refdst_drop(orefdst);
+	} else {
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos),
+				}
+			},
+			.mark = skb->mark,
+		};
+		struct rtable *rt;
+
+		if (ip_route_output_key(net, &rt, &fl))
+			return 0;
+		if (!(rt->rt_flags & RTCF_LOCAL)) {
+			ip_rt_put(rt);
+			return 0;
+		}
+		/* Drop old route. */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	}
+	return 1;
+}
+
 #ifdef CONFIG_IP_VS_IPV6
 
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+	return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+}
+
 static struct dst_entry *
 __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
 			struct in6_addr *ret_saddr, int do_xfrm)
@@ -155,14 +234,21 @@ out_err:
 	return NULL;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *	    &4=Allow redirect from remote daddr to local
+ */
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		      struct in6_addr *ret_saddr, int do_xfrm)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
+		      int do_xfrm, int rt_mode)
 {
-	struct net *net = dev_net(skb->dev);
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rt6_info *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
+	struct rt6_info *ort;			/* Original route */
 	struct dst_entry *dst;
+	int local;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
@@ -188,13 +274,38 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
-					      do_xfrm);
+		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
 		if (!dst)
 			return NULL;
 		rt = (struct rt6_info *) dst;
 	}
 
+	local = __ip_vs_is_local_route6(rt);
+	if (!((local ? 1 : 2) & rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+			     local ? "local":"non-local", daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (local && !(rt_mode & 4) &&
+	    !((ort = (struct rt6_info *) skb_dst(skb)) &&
+	      __ip_vs_is_local_route6(ort))) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+			     "requires NAT method, dest: %pI6\n",
+			     &ipv6_hdr(skb)->daddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+		     ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+				    IPV6_ADDR_LOOPBACK)) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+			     "to non-local address, dest: %pI6\n",
+			     &ipv6_hdr(skb)->saddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+
 	return rt;
 }
 #endif
@@ -227,23 +338,27 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 	__ret;							\
 })
 
-#define IP_VS_XMIT_NAT(pf, skb, cp)				\
+#define IP_VS_XMIT_NAT(pf, skb, cp, local)		\
 do {							\
 	(skb)->ipvs_property = 1;			\
 	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
 		ip_vs_notrack(skb);			\
 	else						\
 		ip_vs_update_conntrack(skb, cp, 1);	\
+	if (local)					\
+		return NF_ACCEPT;			\
 	skb_forward_csum(skb);				\
 	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
 		skb_dst(skb)->dev, dst_output);		\
 } while (0)
 
-#define IP_VS_XMIT(pf, skb, cp)				\
+#define IP_VS_XMIT(pf, skb, cp, local)			\
 do {							\
 	(skb)->ipvs_property = 1;			\
 	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
 		ip_vs_notrack(skb);			\
+	if (local)					\
+		return NF_ACCEPT;			\
 	skb_forward_csum(skb);				\
 	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
 		skb_dst(skb)->dev, dst_output);		\
@@ -258,7 +373,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct ip_vs_protocol *pp)
 {
 	/* we do not touch skb and do not need pskb ptr */
-	return NF_ACCEPT;
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
 }
 
 
@@ -271,27 +386,15 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  struct ip_vs_protocol *pp)
 {
-	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
-	u8     tos = iph->tos;
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = iph->daddr,
-				.saddr = 0,
-				.tos = RT_TOS(tos), } },
-	};
 
 	EnterFunction(10);
 
-	if (ip_route_output_key(net, &rt, &fl)) {
-		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
-			     __func__, &iph->daddr);
+	if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+				      RT_TOS(iph->tos), 2)))
 		goto tx_error_icmp;
-	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -319,7 +422,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -337,18 +440,14 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
-	struct net *net = dev_net(skb->dev);
-	struct dst_entry *dst;
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
 
 	EnterFunction(10);
 
-	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
-	if (!dst)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
 		goto tx_error_icmp;
-	rt = (struct rt6_info *) dst;
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -376,7 +475,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -401,6 +500,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable *rt;		/* Route to the other host */
 	int mtu;
 	struct iphdr *iph = ip_hdr(skb);
+	int local;
 
 	EnterFunction(10);
 
@@ -414,16 +514,40 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos), 1|2|4)))
 		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+		IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -433,16 +557,27 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-		goto tx_error;
+		goto tx_error_put;
 	ip_hdr(skb)->daddr = cp->daddr.ip;
 	ip_send_check(ip_hdr(skb));
 
+	if (!local) {
+		/* drop old route */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
@@ -452,7 +587,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -475,6 +610,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rt6_info *rt;		/* Route to the other host */
 	int mtu;
+	int local;
 
 	EnterFunction(10);
 
@@ -489,18 +625,44 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2|4)))
 		goto tx_error_icmp;
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, pp, skb, 0,
+					 "ip_vs_nat_xmit_v6(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG_RL_PKT(1, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->dst);
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): frag needed for");
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -510,14 +672,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
 		goto tx_error;
-	ipv6_hdr(skb)->daddr = cp->daddr.in6;
+	ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
+
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
 
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 
@@ -528,7 +695,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -588,16 +755,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(tos), 1|2)))
 		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	tdev = rt->dst.dev;
 
 	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 	if (mtu < 68) {
-		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -607,9 +778,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if ((old_iph->frag_off & htons(IP_DF))
 	    && mtu < ntohs(old_iph->tot_len)) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/*
@@ -678,6 +848,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
+tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -703,27 +876,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+					 &saddr, 1, 1|2)))
 		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	tdev = rt->dst.dev;
 
 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
 	if (mtu < IPV6_MIN_MTU) {
-		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
 			     IPV6_MIN_MTU);
-		goto tx_error;
+		goto tx_error_put;
 	}
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
 	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/*
@@ -789,6 +964,9 @@ tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
+tx_error_put:
+	dst_release(&rt->dst);
+	goto tx_error;
 }
 #endif
 
@@ -807,8 +985,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos), 1|2)))
 		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -836,7 +1019,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -859,9 +1042,13 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2)))
 		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -889,7 +1076,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -915,6 +1102,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable	*rt;	/* Route to the other host */
 	int mtu;
 	int rc;
+	int local;
 
 	EnterFunction(10);
 
@@ -935,16 +1123,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
 		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI4\n",
+				  __func__, &cp->daddr.ip);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI4\n",
+			  __func__, &cp->daddr.ip);
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -954,16 +1169,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop the old route when skb is not shared */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	ip_vs_nat_icmp(skb, pp, cp, 0);
 
+	if (!local) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
 	rc = NF_STOLEN;
 	goto out;
@@ -989,6 +1215,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info	*rt;	/* Route to the other host */
 	int mtu;
 	int rc;
+	int local;
 
 	EnterFunction(10);
 
@@ -1009,17 +1236,44 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2|4)))
 		goto tx_error_icmp;
 
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI6\n",
+				  __func__, &cp->daddr.in6);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI6\n",
+			  __func__, &cp->daddr.in6);
+		goto tx_error_put;
+	}
+
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->dst);
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
@@ -1029,16 +1283,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop the old route when skb is not shared */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
 	rc = NF_STOLEN;
 	goto out;
-- 
cgit v1.1


From cb59155f21d4c0507d2034c2953f6a3f7806913d Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:40:51 +0300
Subject: ipvs: changes for local client

	This patch deals with local client processing.

	Prefer LOCAL_OUT hook for scheduling connections from
local clients. LOCAL_IN is still supported if the packets are
not marked as processed in LOCAL_OUT. The idea to process
requests in LOCAL_OUT is to alter conntrack reply before
it is confirmed at POST_ROUTING. If the local requests are
processed in LOCAL_IN the conntrack can not be updated
and matching by state is impossible.

	Add the following handlers:

- ip_vs_reply[46] at LOCAL_IN:99 to process replies from
remote real servers to local clients. Now when both
replies from remote real servers (ip_vs_reply*) and
local real servers (ip_vs_local_reply*) are handled
it is safe to remove the conn_out_get call from ip_vs_in
because it does not support related ICMP packets.

- ip_vs_local_request[46] at LOCAL_OUT:-98 to process
requests from local client

	Handling in LOCAL_OUT causes some changes:

- as skb->dev, skb->protocol and skb->pkt_type are not defined
in LOCAL_OUT make sure we set skb->dev before calling icmpv6_send,
prefer skb_dst(skb) for struct net and remove the skb->protocol
checks from TUN transmitters.

[ horms@verge.net.au: removed trailing whitespace ]
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 266 ++++++++++++++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_xmit.c |  51 +++++---
 2 files changed, 225 insertions(+), 92 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index a6c8aff..5fbcf67 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -529,9 +529,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 	 */
 #ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6)
+	if (svc->af == AF_INET6) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
-	else
+	} else
 #endif
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
@@ -1065,57 +1070,61 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	 */
 	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
-	if (unlikely(!cp)) {
-		if (sysctl_ip_vs_nat_icmp_send &&
-		    (pp->protocol == IPPROTO_TCP ||
-		     pp->protocol == IPPROTO_UDP ||
-		     pp->protocol == IPPROTO_SCTP)) {
-			__be16 _ports[2], *pptr;
-
-			pptr = skb_header_pointer(skb, iph.len,
-						  sizeof(_ports), _ports);
-			if (pptr == NULL)
-				return NF_ACCEPT;	/* Not for me */
-			if (ip_vs_lookup_real_service(af, iph.protocol,
-						      &iph.saddr,
-						      pptr[0])) {
-				/*
-				 * Notify the real server: there is no
-				 * existing entry if it is not RST
-				 * packet or not TCP packet.
-				 */
-				if ((iph.protocol != IPPROTO_TCP &&
-				     iph.protocol != IPPROTO_SCTP)
-				     || ((iph.protocol == IPPROTO_TCP
-					  && !is_tcp_reset(skb, iph.len))
-					 || (iph.protocol == IPPROTO_SCTP
-						&& !is_sctp_abort(skb,
-							iph.len)))) {
+	if (likely(cp))
+		return handle_response(af, skb, pp, cp, iph.len);
+	if (sysctl_ip_vs_nat_icmp_send &&
+	    (pp->protocol == IPPROTO_TCP ||
+	     pp->protocol == IPPROTO_UDP ||
+	     pp->protocol == IPPROTO_SCTP)) {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, iph.len,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			return NF_ACCEPT;	/* Not for me */
+		if (ip_vs_lookup_real_service(af, iph.protocol,
+					      &iph.saddr,
+					      pptr[0])) {
+			/*
+			 * Notify the real server: there is no
+			 * existing entry if it is not RST
+			 * packet or not TCP packet.
+			 */
+			if ((iph.protocol != IPPROTO_TCP &&
+			     iph.protocol != IPPROTO_SCTP)
+			     || ((iph.protocol == IPPROTO_TCP
+				  && !is_tcp_reset(skb, iph.len))
+				 || (iph.protocol == IPPROTO_SCTP
+					&& !is_sctp_abort(skb,
+						iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
-					if (af == AF_INET6)
-						icmpv6_send(skb,
-							    ICMPV6_DEST_UNREACH,
-							    ICMPV6_PORT_UNREACH,
-							    0);
-					else
+				if (af == AF_INET6) {
+					struct net *net =
+						dev_net(skb_dst(skb)->dev);
+
+					if (!skb->dev)
+						skb->dev = net->loopback_dev;
+					icmpv6_send(skb,
+						    ICMPV6_DEST_UNREACH,
+						    ICMPV6_PORT_UNREACH,
+						    0);
+				} else
 #endif
-						icmp_send(skb,
-							  ICMP_DEST_UNREACH,
-							  ICMP_PORT_UNREACH, 0);
-					return NF_DROP;
-				}
+					icmp_send(skb,
+						  ICMP_DEST_UNREACH,
+						  ICMP_PORT_UNREACH, 0);
+				return NF_DROP;
 			}
 		}
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
-		return NF_ACCEPT;
 	}
-
-	return handle_response(af, skb, pp, cp, iph.len);
+	IP_VS_DBG_PKT(12, pp, skb, 0,
+		      "ip_vs_out: packet continues traversal as normal");
+	return NF_ACCEPT;
 }
 
 /*
- *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *	used only for VS/NAT.
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
@@ -1147,7 +1156,8 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
 #ifdef CONFIG_IP_VS_IPV6
 
 /*
- *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *	used only for VS/NAT.
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
@@ -1404,34 +1414,43 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
  *	and send it on its way...
  */
 static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-	 const struct net_device *in, const struct net_device *out,
-	 int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int ret, restart, af, pkts;
+	int ret, restart, pkts;
 
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
 	/*
-	 *	Big tappo: only PACKET_HOST, including loopback for local client
-	 *	Don't handle local packets on IPv6 for now
+	 *	Big tappo:
+	 *	- remote client: only PACKET_HOST
+	 *	- route: used for struct net when skb->dev is unset
 	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST)) {
-		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-			      skb->pkt_type,
-			      iph.protocol,
-			      IP_VS_DBG_ADDR(af, &iph.daddr));
+	if (unlikely((skb->pkt_type != PACKET_HOST &&
+		      hooknum != NF_INET_LOCAL_OUT) ||
+		     !skb_dst(skb))) {
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+			      " ignored in hook %u\n",
+			      skb->pkt_type, iph.protocol,
+			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
 		return NF_ACCEPT;
 	}
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
@@ -1467,11 +1486,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(!cp)) {
 		int v;
 
-		/* For local client packets, it could be a response */
-		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-		if (cp)
-			return handle_response(af, skb, pp, cp, iph.len);
-
 		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
 			return v;
 	}
@@ -1479,7 +1493,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(!cp)) {
 		/* sorry, all this trouble for a no-hit :) */
 		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
+			      "ip_vs_in: packet continues traversal as normal");
 		return NF_ACCEPT;
 	}
 
@@ -1550,6 +1564,72 @@ out:
 	return ret;
 }
 
+/*
+ *	AF_INET handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *	AF_INET handler in NF_INET_LOCAL_OUT chain
+ *	Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_in(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ *	Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_in(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return verdict;
+}
+
+#endif
+
 
 /*
  *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1590,15 +1670,23 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 99,
+	},
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
 	 * applied to IPVS. */
 	{
-		.hook		= ip_vs_in,
+		.hook		= ip_vs_remote_request4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 101,
 	},
 	/* Before ip_vs_in, change source only for VS/NAT */
 	{
@@ -1608,14 +1696,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= -99,
 	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -98,
+	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
 	{
 		.hook		= ip_vs_forward_icmp,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
 	},
 	/* After packet filtering, change source only for VS/NAT */
 	{
@@ -1626,15 +1722,23 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.priority	= 100,
 	},
 #ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 99,
+	},
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
 	 * applied to IPVS. */
 	{
-		.hook		= ip_vs_in,
+		.hook		= ip_vs_remote_request6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 101,
 	},
 	/* Before ip_vs_in, change source only for VS/NAT */
 	{
@@ -1644,14 +1748,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= -99,
 	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -98,
+	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
 	{
 		.hook		= ip_vs_forward_icmp_v6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
 	},
 	/* After packet filtering, change source only for VS/NAT */
 	{
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 8608882..97b5361 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
  *
  * Changes:
  *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -452,8 +462,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error;
 	}
@@ -659,6 +674,11 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): frag needed for");
@@ -748,13 +768,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (skb->protocol != htons(ETH_P_IP)) {
-		IP_VS_DBG_RL("%s(): protocol error, "
-			     "ETH_P_IP: %d, skb protocol: %d\n",
-			     __func__, htons(ETH_P_IP), skb->protocol);
-		goto tx_error;
-	}
-
 	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
 				      RT_TOS(tos), 1|2)))
 		goto tx_error_icmp;
@@ -869,13 +882,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (skb->protocol != htons(ETH_P_IPV6)) {
-		IP_VS_DBG_RL("%s(): protocol error, "
-			     "ETH_P_IPV6: %d, skb protocol: %d\n",
-			     __func__, htons(ETH_P_IPV6), skb->protocol);
-		goto tx_error;
-	}
-
 	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
 					 &saddr, 1, 1|2)))
 		goto tx_error_icmp;
@@ -896,6 +902,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
 	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error_put;
@@ -1053,6 +1064,11 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1271,6 +1287,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error_put;
-- 
cgit v1.1


From 3233759be7eeca9998c514b8f49e8cf2b85e64d3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:43:36 +0300
Subject: ipvs: inherit forwarding method in backup

 	Connections in backup server should inherit the
forwarding method from real server. It is a way to fix a
problem where the forwarding method in backup connection
is damaged by logical OR operation with the real server's
connection flags. And the change is needed for setups
where the backup server uses different forwarding method
for the same real servers.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_conn.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 1d1a529..e9adecd 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -563,6 +563,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 		 */
 		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
 			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+		/* connections inherit forwarding method from dest */
+		cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
 	}
 	cp->flags |= conn_flags;
 	cp->dest = dest;
-- 
cgit v1.1


From 0d79641a96d612aaa6d57a4d4f521d7ed9c9ccdd Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 17 Oct 2010 16:46:17 +0300
Subject: ipvs: provide address family for debugging

 	As skb->protocol is not valid in LOCAL_OUT add
parameter for address family in packet debugging functions.
Even if ports are not present in AH and ESP change them to
use ip_vs_tcpudp_debug_packet to show at least valid addresses
as before. This patch removes the last user of skb->protocol
in IPVS.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c         | 41 +++++++++++++++-----------
 net/netfilter/ipvs/ip_vs_proto.c        |  8 ++---
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 52 ++-------------------------------
 net/netfilter/ipvs/ip_vs_proto_sctp.c   |  2 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c    |  4 +--
 net/netfilter/ipvs/ip_vs_proto_udp.c    |  4 +--
 net/netfilter/ipvs/ip_vs_xmit.c         | 18 +++++++-----
 7 files changed, 45 insertions(+), 84 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5fbcf67..b4e51e9 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -365,7 +365,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	 * with persistence the connection is created on SYN+ACK.
 	 */
 	if (pptr[0] == FTPDATA) {
-		IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA");
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling FTPDATA");
 		return NULL;
 	}
 
@@ -376,7 +377,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
 	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
 	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
-		IP_VS_DBG_PKT(12, pp, skb, 0,
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
 			      "Not scheduling reply for existing connection");
 		__ip_vs_conn_put(cp);
 		return NULL;
@@ -617,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 			"Forwarding altered outgoing ICMP");
 	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 			"Forwarding altered incoming ICMP");
 }
 
@@ -662,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	skb->ip_summed = CHECKSUM_PARTIAL;
 
 	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered outgoing ICMPv6");
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered outgoing ICMPv6");
 	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered incoming ICMPv6");
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered incoming ICMPv6");
 }
 #endif
 
@@ -798,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
 		     pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking outgoing ICMP for");
 
 	offset += cih->ihl * 4;
 
@@ -874,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking outgoing ICMPv6 for");
 
 	offset += sizeof(struct ipv6hdr);
 
@@ -922,7 +927,7 @@ static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		struct ip_vs_conn *cp, int ihl)
 {
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
 	if (!skb_make_writable(skb, ihl))
 		goto drop;
@@ -967,7 +972,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			goto drop;
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
 
 	ip_vs_out_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
@@ -1117,7 +1122,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 			}
 		}
 	}
-	IP_VS_DBG_PKT(12, pp, skb, 0,
+	IP_VS_DBG_PKT(12, af, pp, skb, 0,
 		      "ip_vs_out: packet continues traversal as normal");
 	return NF_ACCEPT;
 }
@@ -1253,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 		     pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking incoming ICMP for");
 
 	offset += cih->ihl * 4;
 
@@ -1364,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking incoming ICMPv6 for");
 
 	offset += sizeof(struct ipv6hdr);
 
@@ -1492,12 +1499,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 
 	if (unlikely(!cp)) {
 		/* sorry, all this trouble for a no-hit :) */
-		IP_VS_DBG_PKT(12, pp, skb, 0,
+		IP_VS_DBG_PKT(12, af, pp, skb, 0,
 			      "ip_vs_in: packet continues traversal as normal");
 		return NF_ACCEPT;
 	}
 
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
 
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 027f654..c539983 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 	else if (ih->frag_off & htons(IP_OFFSET))
 		sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
 	else {
-		__be16 _ports[2], *pptr
-;
+		__be16 _ports[2], *pptr;
+
 		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 
 void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
 			  const struct sk_buff *skb,
 			  int offset,
 			  const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
+	if (af == AF_INET6)
 		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
 	else
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 8956ef3..3a04611 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -117,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	return 0;
 }
 
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "TRUNCATED");
-	else
-		sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
-	pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct ipv6hdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "TRUNCATED");
-	else
-		sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
-	pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		    int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
-		ah_esp_debug_packet_v6(pp, skb, offset, msg);
-	else
-#endif
-		ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
 static void ah_esp_init(struct ip_vs_protocol *pp)
 {
 	/* nothing to do now */
@@ -195,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 	.set_state_timeout =	NULL,
 };
@@ -219,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 };
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 9ab5232..d254345 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -176,7 +176,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 
 	if (val != cmp) {
 		/* CRC failure, dump it. */
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 				"Failed checksum for");
 		return 0;
 	}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 85d80a6..f6c5200 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -300,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 					    skb->len - tcphoff,
 					    ipv6_hdr(skb)->nexthdr,
 					    skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 						 "Failed checksum for");
 				return 0;
 			}
@@ -311,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 					      skb->len - tcphoff,
 					      ip_hdr(skb)->protocol,
 					      skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 						 "Failed checksum for");
 				return 0;
 			}
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 5d21f08..9d106a0 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -314,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 						    skb->len - udphoff,
 						    ipv6_hdr(skb)->nexthdr,
 						    skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 							 "Failed checksum for");
 					return 0;
 				}
@@ -325,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 						      skb->len - udphoff,
 						      ip_hdr(skb)->protocol,
 						      skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 							 "Failed checksum for");
 					return 0;
 				}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 97b5361..de04ea3 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
 
 		if (ct && !nf_ct_is_untracked(ct)) {
-			IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): "
+			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+					 "ip_vs_nat_xmit(): "
 					 "stopping DNAT to local address");
 			goto tx_error_put;
 		}
@@ -552,7 +553,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* From world but DNAT to loopback address? */
 	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
-		IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): "
+		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
 				 "stopping DNAT to loopback address");
 		goto tx_error_put;
 	}
@@ -561,7 +562,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
+		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+				 "ip_vs_nat_xmit(): frag needed for");
 		goto tx_error_put;
 	}
 
@@ -593,7 +595,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			goto tx_error;
 	}
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
@@ -654,7 +656,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
 
 		if (ct && !nf_ct_is_untracked(ct)) {
-			IP_VS_DBG_RL_PKT(10, pp, skb, 0,
+			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
 					 "ip_vs_nat_xmit_v6(): "
 					 "stopping DNAT to local address");
 			goto tx_error_put;
@@ -665,7 +667,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* From world but DNAT to loopback address? */
 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
 	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
-		IP_VS_DBG_RL_PKT(1, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): "
 				 "stopping DNAT to loopback address");
 		goto tx_error_put;
@@ -680,7 +682,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			skb->dev = net->loopback_dev;
 		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): frag needed for");
 		goto tx_error_put;
 	}
@@ -706,7 +708,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		dst_release(&rt->dst);
 	}
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
-- 
cgit v1.1


From 106e4c26b1529e559d1aae777f11b4f8f7bafc26 Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Thu, 21 Oct 2010 12:45:14 +0200
Subject: tproxy: kick out TIME_WAIT sockets in case a new connection comes in
 with the same tuple

Without tproxy redirections an incoming SYN kicks out conflicting
TIME_WAIT sockets, in order to handle clients that reuse ports
within the TIME_WAIT period.

The same mechanism didn't work in case TProxy is involved in finding
the proper socket, as the time_wait processing code looked up the
listening socket assuming that the listener addr/port matches those
of the established connection.

This is not the case with TProxy as the listener addr/port is possibly
changed with the tproxy rule.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_tproxy_core.c | 29 +++++++++++++-----
 net/netfilter/xt_TPROXY.c      | 68 ++++++++++++++++++++++++++++++++++++++----
 net/netfilter/xt_socket.c      |  2 +-
 3 files changed, 85 insertions(+), 14 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 5490fc3..8589e5e 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -22,21 +22,34 @@ struct sock *
 nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, bool listening_only)
+		      const struct net_device *in, int lookup_type)
 {
 	struct sock *sk;
 
 	/* look up socket */
 	switch (protocol) {
 	case IPPROTO_TCP:
-		if (listening_only)
-			sk = __inet_lookup_listener(net, &tcp_hashinfo,
-						    daddr, ntohs(dport),
-						    in->ifindex);
-		else
+		switch (lookup_type) {
+		case NFT_LOOKUP_ANY:
 			sk = __inet_lookup(net, &tcp_hashinfo,
 					   saddr, sport, daddr, dport,
 					   in->ifindex);
+			break;
+		case NFT_LOOKUP_LISTENER:
+			sk = inet_lookup_listener(net, &tcp_hashinfo,
+						    daddr, dport,
+						    in->ifindex);
+			break;
+		case NFT_LOOKUP_ESTABLISHED:
+			sk = inet_lookup_established(net, &tcp_hashinfo,
+						    saddr, sport, daddr, dport,
+						    in->ifindex);
+			break;
+		default:
+			WARN_ON(1);
+			sk = NULL;
+			break;
+		}
 		break;
 	case IPPROTO_UDP:
 		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
@@ -47,8 +60,8 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 		sk = NULL;
 	}
 
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
+	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
 
 	return sk;
 }
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 21bb2af..e0b6900 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -24,6 +24,57 @@
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
+/**
+ * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct xt_tproxy_target_info *tgi = par->targinfo;
+	struct tcphdr _hdr, *hp;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					    iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
+					    hp->source, tgi->lport ? tgi->lport : hp->dest,
+					    par->in, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			/* yeah, there's one, let's kill the TIME_WAIT
+			 * socket and redirect to the listener
+			 */
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
 static unsigned int
 tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -37,11 +88,18 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		return NF_DROP;
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-				   iph->saddr,
-				   tgi->laddr ? tgi->laddr : iph->daddr,
-				   hp->source,
-				   tgi->lport ? tgi->lport : hp->dest,
-				   par->in, true);
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		sk = tproxy_handle_time_wait(skb, par, sk);
+	else if (!sk)
+		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
+					   hp->source, tgi->lport ? tgi->lport : hp->dest,
+					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && nf_tproxy_assign_sock(skb, sk)) {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 1ca8990..266faa0 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -142,7 +142,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 #endif
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
-				   saddr, daddr, sport, dport, par->in, false);
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
 	if (sk != NULL) {
 		bool wildcard;
 		bool transparent = true;
-- 
cgit v1.1


From 6006db84a91838813cdad8a6622a4e39efe9ea47 Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Thu, 21 Oct 2010 12:47:34 +0200
Subject: tproxy: add lookup type checks for UDP in nf_tproxy_get_sock_v4()

Also, inline this function as the lookup_type is always a literal
and inlining removes branches performed at runtime.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_tproxy_core.c | 48 ------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 8589e5e..db65563 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -18,54 +18,6 @@
 #include <net/udp.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
-struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
-		      const __be32 saddr, const __be32 daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, int lookup_type)
-{
-	struct sock *sk;
-
-	/* look up socket */
-	switch (protocol) {
-	case IPPROTO_TCP:
-		switch (lookup_type) {
-		case NFT_LOOKUP_ANY:
-			sk = __inet_lookup(net, &tcp_hashinfo,
-					   saddr, sport, daddr, dport,
-					   in->ifindex);
-			break;
-		case NFT_LOOKUP_LISTENER:
-			sk = inet_lookup_listener(net, &tcp_hashinfo,
-						    daddr, dport,
-						    in->ifindex);
-			break;
-		case NFT_LOOKUP_ESTABLISHED:
-			sk = inet_lookup_established(net, &tcp_hashinfo,
-						    saddr, sport, daddr, dport,
-						    in->ifindex);
-			break;
-		default:
-			WARN_ON(1);
-			sk = NULL;
-			break;
-		}
-		break;
-	case IPPROTO_UDP:
-		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
-
-	return sk;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
 
 static void
 nf_tproxy_destructor(struct sk_buff *skb)
-- 
cgit v1.1


From 6ad7889327a5ee6ab4220bd34e4428c7d0de0f32 Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Thu, 21 Oct 2010 16:17:26 +0200
Subject: tproxy: added IPv6 support to the TPROXY target

This requires a new revision as the old target structure was
IPv4 specific.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_TPROXY.c | 262 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 225 insertions(+), 37 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e0b6900..d5f97e2 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
 /*
  * Transparent proxy support for Linux/iptables
  *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
  * Author: Balazs Scheidler, Krisztian Kovacs
  *
  * This program is free software; you can redistribute it and/or modify
@@ -19,15 +19,18 @@
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
 /**
- * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
  * @skb:	The skb being processed.
- * @par:	Iptables target parameters.
+ * @laddr:	IPv4 address to redirect to or zero.
+ * @lport:	TCP port to redirect to or zero.
  * @sk:		The TIME_WAIT TCP socket found by the lookup.
  *
  * We have to handle SYN packets arriving to TIME_WAIT sockets
@@ -35,16 +38,16 @@
  * redirect the new connection to the proxy if there's a listener
  * socket present.
  *
- * tproxy_handle_time_wait() consumes the socket reference passed in.
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
  *
  * Returns the listener socket if there's one, the TIME_WAIT socket if
  * no such listener is found, or NULL if the TCP header is incomplete.
  */
 static struct sock *
-tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk)
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+			struct sock *sk)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct tcphdr _hdr, *hp;
 
 	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
@@ -59,13 +62,64 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par,
 		struct sock *sk2;
 
 		sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-					    iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-					    hp->source, tgi->lport ? tgi->lport : hp->dest,
-					    par->in, NFT_LOOKUP_LISTENER);
+					    iph->saddr, laddr ? laddr : iph->daddr,
+					    hp->source, lport ? lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					    &iph->saddr,
+					    !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
+					    hp->source,
+					    tgi->lport ? tgi->lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
 		if (sk2) {
-			/* yeah, there's one, let's kill the TIME_WAIT
-			 * socket and redirect to the listener
-			 */
 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
 			inet_twsk_put(inet_twsk(sk));
 			sk = sk2;
@@ -76,10 +130,10 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par,
 }
 
 static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+	   u_int32_t mark_mask, u_int32_t mark_value)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
 
@@ -87,18 +141,105 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (hp == NULL)
 		return NF_DROP;
 
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
-				   par->in, NFT_LOOKUP_ESTABLISHED);
+				   skb->dev, NFT_LOOKUP_ESTABLISHED);
 
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
-		sk = tproxy_handle_time_wait(skb, par, sk);
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
 	else if (!sk)
+		/* no, there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-					   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-					   hp->source, tgi->lport ? tgi->lport : hp->dest,
+					   iph->saddr, laddr ? laddr : iph->daddr,
+					   hp->source, lport ? lport : hp->dest,
+					   skb->dev, NFT_LOOKUP_LISTENER);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+		/* This should be in a separate target, but we don't do multiple
+		   targets on the same rule yet */
+		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+		pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+			 iph->protocol, &iph->daddr, ntohs(hp->dest),
+			 &laddr, ntohs(lport), skb->mark);
+		return NF_ACCEPT;
+	}
+
+	pr_debug("no socket, dropping: proto %hhu %08x:%hu -> %08x:%hu, mark: %x\n",
+		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
+		 ntohl(laddr), ntohs(lport), skb->mark);
+	return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+	int thoff;
+	int tproto;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+	else if (!sk)
+		/* no there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					   &iph->saddr,
+					   !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
+					   hp->source,
+					   tgi->lport ? tgi->lport : hp->dest,
 					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
@@ -107,19 +248,33 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
 
-		pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-			 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-			 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+		pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+			 tproto, &iph->saddr, ntohs(hp->dest),
+			 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
 		return NF_ACCEPT;
 	}
 
-	pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-		 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+	pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+		 tproto, &iph->saddr, ntohs(hp->dest),
+		 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
 	return NF_DROP;
 }
 
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_ip6 *i = par->entryinfo;
+
+	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+	    && !(i->flags & IP6T_INV_PROTO))
+		return 0;
+
+	pr_info("Can be used only in combination with "
+		"either -p tcp or -p udp\n");
+	return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct ipt_ip *i = par->entryinfo;
 
@@ -132,31 +287,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
 	return -EINVAL;
 }
 
-static struct xt_target tproxy_tg_reg __read_mostly = {
-	.name		= "TPROXY",
-	.family		= NFPROTO_IPV4,
-	.table		= "mangle",
-	.target		= tproxy_tg,
-	.targetsize	= sizeof(struct xt_tproxy_target_info),
-	.checkentry	= tproxy_tg_check,
-	.hooks		= 1 << NF_INET_PRE_ROUTING,
-	.me		= THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v0,
+		.revision	= 0,
+		.targetsize	= sizeof(struct xt_tproxy_target_info),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV6,
+		.table		= "mangle",
+		.target		= tproxy_tg6_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg6_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#endif
+
 };
 
 static int __init tproxy_tg_init(void)
 {
 	nf_defrag_ipv4_enable();
-	return xt_register_target(&tproxy_tg_reg);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	nf_defrag_ipv6_enable();
+#endif
+
+	return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 static void __exit tproxy_tg_exit(void)
 {
-	xt_unregister_target(&tproxy_tg_reg);
+	xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 module_init(tproxy_tg_init);
 module_exit(tproxy_tg_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
 MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
 MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
-- 
cgit v1.1


From b64c9256a9b76fc9f059f71bd08ba88fb0cbba2e Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Thu, 21 Oct 2010 16:19:42 +0200
Subject: tproxy: added IPv6 support to the socket match

The ICMP extraction bits were contributed by Harry Mason.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_socket.c | 165 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 154 insertions(+), 11 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 266faa0..2dbd4c8 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/icmp.h>
@@ -21,6 +22,7 @@
 #include <net/inet_sock.h>
 #include <net/netfilter/nf_tproxy_core.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
 #include <linux/netfilter/xt_socket.h>
 
@@ -30,7 +32,7 @@
 #endif
 
 static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
 		    u8 *protocol,
 		    __be32 *raddr,
 		    __be32 *laddr,
@@ -86,7 +88,6 @@ extract_icmp_fields(const struct sk_buff *skb,
 	return 0;
 }
 
-
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	     const struct xt_socket_mtinfo1 *info)
@@ -115,7 +116,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 		dport = hp->dest;
 
 	} else if (iph->protocol == IPPROTO_ICMP) {
-		if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
 					&sport, &dport))
 			return false;
 	} else {
@@ -165,32 +166,157 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 			sk = NULL;
 	}
 
-	pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport),
-		 ntohl(daddr), ntohs(dport),
-		 ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+	pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+		 protocol, &saddr, ntohs(sport),
+		 &daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
 
 	return (sk != NULL);
 }
 
 static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, NULL);
 }
 
 static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, par->matchinfo);
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+		     unsigned int outside_hdrlen,
+		     u8 *protocol,
+		     struct in6_addr **raddr,
+		     struct in6_addr **laddr,
+		     __be16 *rport,
+		     __be16 *lport)
+{
+	struct ipv6hdr *inside_iph, _inside_iph;
+	struct icmp6hdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+	u8 inside_nexthdr;
+	int inside_hdrlen;
+
+	icmph = skb_header_pointer(skb, outside_hdrlen,
+				   sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+		return 1;
+
+	inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
+	if (inside_iph == NULL)
+		return 1;
+	inside_nexthdr = inside_iph->nexthdr;
+
+	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+	if (inside_hdrlen < 0)
+		return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+	if (inside_nexthdr != IPPROTO_TCP &&
+	    inside_nexthdr != IPPROTO_UDP)
+		return 1;
+
+	ports = skb_header_pointer(skb, inside_hdrlen,
+				   sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inside_nexthdr;
+	*laddr = &inside_iph->saddr;
+	*lport = ports[0];
+	*raddr = &inside_iph->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	struct in6_addr *daddr, *saddr;
+	__be16 dport, sport;
+	int thoff;
+	u8 tproto;
+	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return false;
+
+		saddr = &iph->saddr;
+		sport = hp->source;
+		daddr = &iph->daddr;
+		dport = hp->dest;
+
+	} else if (tproto == IPPROTO_ICMPV6) {
+		if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+					 &sport, &dport))
+			return false;
+	} else {
+		return false;
+	}
+
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+	if (sk != NULL) {
+		bool wildcard;
+		bool transparent = true;
+
+		/* Ignore sockets listening on INADDR_ANY */
+		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+		/* Ignore non-transparent sockets,
+		   if XT_SOCKET_TRANSPARENT is used */
+		if (info && info->flags & XT_SOCKET_TRANSPARENT)
+			transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+					inet_sk(sk)->transparent) ||
+				       (sk->sk_state == TCP_TIME_WAIT &&
+					inet_twsk(sk)->tw_transparent));
+
+		nf_tproxy_put_sock(sk);
+
+		if (wildcard || !transparent)
+			sk = NULL;
+	}
+
+	pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu "
+		 "(orig %pI6:%hu) sock %p\n",
+		 tproto, saddr, ntohs(sport),
+		 daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+	return (sk != NULL);
+}
+#endif
+
 static struct xt_match socket_mt_reg[] __read_mostly = {
 	{
 		.name		= "socket",
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v0,
+		.match		= socket_mt4_v0,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
@@ -199,17 +325,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.name		= "socket",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v1,
+		.match		= socket_mt4_v1,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
 	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+		.name		= "socket",
+		.revision	= 1,
+		.family		= NFPROTO_IPV6,
+		.match		= socket_mt6_v1,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#endif
 };
 
 static int __init socket_mt_init(void)
 {
 	nf_defrag_ipv4_enable();
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	nf_defrag_ipv6_enable();
+#endif
+
 	return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
 }
 
@@ -225,3 +367,4 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
 MODULE_DESCRIPTION("x_tables socket match module");
 MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
-- 
cgit v1.1


From cc6eb433856983e91071469c4ce57accb6947ccb Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Thu, 21 Oct 2010 16:21:10 +0200
Subject: tproxy: use the interface primary IP address as a default value for
 --on-ip

The REDIRECT target and the older TProxy versions used the primary address
of the incoming interface as the default value of the --on-ip parameter.
This was unintentionally changed during the initial TProxy submission and
caused confusion among users.

Since IPv6 has no notion of primary address, we just select the first address
on the list: this way the socket lookup finds wildcard bound sockets
properly and we cannot really do better without the user telling us the
IPv6 address of the proxy.

This is implemented for both IPv4 and IPv6.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_TPROXY.c | 202 ++++++++++++++++++++++++++++++----------------
 1 file changed, 132 insertions(+), 70 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index d5f97e2..19c482c 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -16,15 +16,41 @@
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
 #include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr;
+
+	if (user_laddr)
+		return user_laddr;
+
+	laddr = 0;
+	rcu_read_lock();
+	indev = __in_dev_get_rcu(skb->dev);
+	for_primary_ifa(indev) {
+		laddr = ifa->ifa_local;
+		break;
+	} endfor_ifa(indev);
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
 
 /**
  * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
@@ -75,60 +101,6 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 	return sk;
 }
 
-/**
- * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
- * @skb:	The skb being processed.
- * @tproto:	Transport protocol.
- * @thoff:	Transport protocol header offset.
- * @par:	Iptables target parameters.
- * @sk:		The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait6() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
-			 const struct xt_action_param *par,
-			 struct sock *sk)
-{
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct tcphdr _hdr, *hp;
-	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
-
-	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
-	if (hp == NULL) {
-		inet_twsk_put(inet_twsk(sk));
-		return NULL;
-	}
-
-	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
-		/* SYN to a TIME_WAIT socket, we'd rather redirect it
-		 * to a listener socket if there's one */
-		struct sock *sk2;
-
-		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
-					    &iph->saddr,
-					    !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
-					    hp->source,
-					    tgi->lport ? tgi->lport : hp->dest,
-					    skb->dev, NFT_LOOKUP_LISTENER);
-		if (sk2) {
-			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
-			inet_twsk_put(inet_twsk(sk));
-			sk = sk2;
-		}
-	}
-
-	return sk;
-}
-
 static unsigned int
 tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 	   u_int32_t mark_mask, u_int32_t mark_value)
@@ -150,6 +122,10 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 				   hp->source, hp->dest,
 				   skb->dev, NFT_LOOKUP_ESTABLISHED);
 
+	laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+	if (!lport)
+		lport = hp->dest;
+
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
 		/* reopening a TIME_WAIT connection needs special handling */
@@ -158,8 +134,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-					   iph->saddr, laddr ? laddr : iph->daddr,
-					   hp->source, lport ? lport : hp->dest,
+					   iph->saddr, laddr,
+					   hp->source, lport,
 					   skb->dev, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
@@ -174,9 +150,9 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 		return NF_ACCEPT;
 	}
 
-	pr_debug("no socket, dropping: proto %hhu %08x:%hu -> %08x:%hu, mark: %x\n",
-		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-		 ntohl(laddr), ntohs(lport), skb->mark);
+	pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+		 iph->protocol, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
 	return NF_DROP;
 }
 
@@ -197,6 +173,88 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+	      const struct in6_addr *daddr)
+{
+	struct inet6_dev *indev;
+	struct inet6_ifaddr *ifa;
+	struct in6_addr *laddr;
+
+	if (!ipv6_addr_any(user_laddr))
+		return user_laddr;
+	laddr = NULL;
+
+	rcu_read_lock();
+	indev = __in6_dev_get(skb->dev);
+	if (indev)
+		list_for_each_entry(ifa, &indev->addr_list, if_list) {
+			if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+				continue;
+
+			laddr = &ifa->addr;
+			break;
+		}
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					    &iph->saddr,
+					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+					    hp->source,
+					    tgi->lport ? tgi->lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
 static unsigned int
 tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -204,6 +262,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
+	const struct in6_addr *laddr;
+	__be16 lport;
 	int thoff;
 	int tproto;
 
@@ -228,6 +288,9 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 				   hp->source, hp->dest,
 				   par->in, NFT_LOOKUP_ESTABLISHED);
 
+	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+	lport = tgi->lport ? tgi->lport : hp->dest;
+
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
 		/* reopening a TIME_WAIT connection needs special handling */
@@ -236,10 +299,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
-					   &iph->saddr,
-					   !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
-					   hp->source,
-					   tgi->lport ? tgi->lport : hp->dest,
+					   &iph->saddr, laddr,
+					   hp->source, lport,
 					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
@@ -249,14 +310,15 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
 
 		pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
-			 tproto, &iph->saddr, ntohs(hp->dest),
-			 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+			 tproto, &iph->saddr, ntohs(hp->source),
+			 laddr, ntohs(lport), skb->mark);
 		return NF_ACCEPT;
 	}
 
 	pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
-		 tproto, &iph->saddr, ntohs(hp->dest),
-		 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+		 tproto, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
+
 	return NF_DROP;
 }
 
-- 
cgit v1.1