aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/netfilter/Kconfig8
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c20
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c12
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c3
-rw-r--r--net/ipv4/tcp_output.c233
-rw-r--r--net/ipv4/tcp_vegas.c16
6 files changed, 150 insertions, 142 deletions
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 0bc0052..88a6065 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK
instead of the individual packets.
config IP_NF_CONNTRACK_EVENTS
- bool "Connection tracking events"
- depends on IP_NF_CONNTRACK
+ bool "Connection tracking events (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && IP_NF_CONNTRACK
help
If this option is enabled, the connection tracking code will
provide a notifier chain that can be used by other kernel code
@@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS
IF unsure, say `N'.
config IP_NF_CONNTRACK_NETLINK
- tristate 'Connection tracking netlink interface'
- depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+ tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
+ depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK
depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
help
This option enables support for a netlink-based userspace interface
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 7a4ecdd..84c66db 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data)
return 1;
}
+void ip_conntrack_flush(void)
+{
+ ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
{
if (vmalloced)
@@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
get_order(sizeof(struct list_head) * size));
}
-void ip_conntrack_flush(void)
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
{
+ ip_ct_attach = NULL;
+
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
@@ -1363,7 +1372,7 @@ void ip_conntrack_flush(void)
ip_ct_event_cache_flush();
i_see_dead_people:
- ip_ct_iterate_cleanup(kill_all, NULL);
+ ip_conntrack_flush();
if (atomic_read(&ip_conntrack_count) != 0) {
schedule();
goto i_see_dead_people;
@@ -1371,14 +1380,7 @@ void ip_conntrack_flush(void)
/* wait until all references to ip_conntrack_untracked are dropped */
while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
schedule();
-}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
-{
- ip_ct_attach = NULL;
- ip_conntrack_flush();
kmem_cache_destroy(ip_conntrack_cachep);
kmem_cache_destroy(ip_conntrack_expect_cachep);
free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 3fce91b..91fe8f2 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -503,7 +503,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
}
static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
+ [CTA_PROTO_NUM-1] = sizeof(u_int8_t),
[CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
[CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
[CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
@@ -528,7 +528,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
if (!tb[CTA_PROTO_NUM-1])
return -EINVAL;
- tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+ tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
@@ -728,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
return -ENOENT;
}
}
- if (del_timer(&ct->timeout)) {
- ip_conntrack_put(ct);
+ if (del_timer(&ct->timeout))
ct->timeout.function((unsigned long)ct);
- return 0;
- }
+
ip_conntrack_put(ct);
DEBUGP("leaving\n");
@@ -877,7 +875,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
DEBUGP("NAT status: %lu\n",
status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
- if (ip_nat_initialized(ct, hooknum))
+ if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
return -EEXIST;
ip_nat_setup_info(ct, &range, hooknum);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index aeb7353..e7fa29e 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s,
static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
const struct ip_conntrack *ct)
{
- struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+ struct nfattr *nest_parms;
read_lock_bh(&tcp_lock);
+ nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
&ct->proto.tcp.state);
read_unlock_bh(&tcp_lock);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 029c70d..b7325e0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
- if (skb != NULL) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- int tcp_header_size = tp->tcp_header_len;
- struct tcphdr *th;
- int sysctl_flags;
- int err;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet;
+ struct tcp_sock *tp;
+ struct tcp_skb_cb *tcb;
+ int tcp_header_size;
+ struct tcphdr *th;
+ int sysctl_flags;
+ int err;
+
+ BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+ /* If congestion control is doing timestamping, we must
+ * take such a timestamp before we potentially clone/copy.
+ */
+ if (icsk->icsk_ca_ops->rtt_sample)
+ __net_timestamp(skb);
+
+ if (likely(clone_it)) {
+ if (unlikely(skb_cloned(skb)))
+ skb = pskb_copy(skb, gfp_mask);
+ else
+ skb = skb_clone(skb, gfp_mask);
+ if (unlikely(!skb))
+ return -ENOBUFS;
+ }
- BUG_ON(!tcp_skb_pcount(skb));
+ inet = inet_sk(sk);
+ tp = tcp_sk(sk);
+ tcb = TCP_SKB_CB(skb);
+ tcp_header_size = tp->tcp_header_len;
#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4
- /* If congestion control is doing timestamping */
- if (icsk->icsk_ca_ops->rtt_sample)
- __net_timestamp(skb);
-
- sysctl_flags = 0;
- if (tcb->flags & TCPCB_FLAG_SYN) {
- tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
- if(sysctl_tcp_timestamps) {
- tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
- sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
- }
- if(sysctl_tcp_window_scaling) {
- tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
- sysctl_flags |= SYSCTL_FLAG_WSCALE;
- }
- if(sysctl_tcp_sack) {
- sysctl_flags |= SYSCTL_FLAG_SACK;
- if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
- tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
- }
- } else if (tp->rx_opt.eff_sacks) {
- /* A SACK is 2 pad bytes, a 2 byte header, plus
- * 2 32-bit sequence numbers for each SACK block.
- */
- tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+ sysctl_flags = 0;
+ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+ if(sysctl_tcp_timestamps) {
+ tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
}
-
- if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(sk, CA_EVENT_TX_START);
-
- th = (struct tcphdr *) skb_push(skb, tcp_header_size);
- skb->h.th = th;
- skb_set_owner_w(skb, sk);
-
- /* Build TCP header and checksum it. */
- th->source = inet->sport;
- th->dest = inet->dport;
- th->seq = htonl(tcb->seq);
- th->ack_seq = htonl(tp->rcv_nxt);
- *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
- if (tcb->flags & TCPCB_FLAG_SYN) {
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- th->window = htons(tp->rcv_wnd);
- } else {
- th->window = htons(tcp_select_window(sk));
+ if (sysctl_tcp_window_scaling) {
+ tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_WSCALE;
}
- th->check = 0;
- th->urg_ptr = 0;
-
- if (tp->urg_mode &&
- between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
- th->urg_ptr = htons(tp->snd_up-tcb->seq);
- th->urg = 1;
+ if (sysctl_tcp_sack) {
+ sysctl_flags |= SYSCTL_FLAG_SACK;
+ if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+ tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
}
+ } else if (unlikely(tp->rx_opt.eff_sacks)) {
+ /* A SACK is 2 pad bytes, a 2 byte header, plus
+ * 2 32-bit sequence numbers for each SACK block.
+ */
+ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks *
+ TCPOLEN_SACK_PERBLOCK));
+ }
+
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
+
+ th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+ skb->h.th = th;
+ skb_set_owner_w(skb, sk);
+
+ /* Build TCP header and checksum it. */
+ th->source = inet->sport;
+ th->dest = inet->dport;
+ th->seq = htonl(tcb->seq);
+ th->ack_seq = htonl(tp->rcv_nxt);
+ *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
+ tcb->flags);
+
+ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(tp->rcv_wnd);
+ } else {
+ th->window = htons(tcp_select_window(sk));
+ }
+ th->check = 0;
+ th->urg_ptr = 0;
- if (tcb->flags & TCPCB_FLAG_SYN) {
- tcp_syn_build_options((__u32 *)(th + 1),
- tcp_advertise_mss(sk),
- (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
- (sysctl_flags & SYSCTL_FLAG_SACK),
- (sysctl_flags & SYSCTL_FLAG_WSCALE),
- tp->rx_opt.rcv_wscale,
- tcb->when,
- tp->rx_opt.ts_recent);
- } else {
- tcp_build_and_update_options((__u32 *)(th + 1),
- tp, tcb->when);
+ if (unlikely(tp->urg_mode &&
+ between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
+ th->urg_ptr = htons(tp->snd_up-tcb->seq);
+ th->urg = 1;
+ }
- TCP_ECN_send(sk, tp, skb, tcp_header_size);
- }
- tp->af_specific->send_check(sk, th, skb->len, skb);
+ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ tcp_syn_build_options((__u32 *)(th + 1),
+ tcp_advertise_mss(sk),
+ (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+ (sysctl_flags & SYSCTL_FLAG_SACK),
+ (sysctl_flags & SYSCTL_FLAG_WSCALE),
+ tp->rx_opt.rcv_wscale,
+ tcb->when,
+ tp->rx_opt.ts_recent);
+ } else {
+ tcp_build_and_update_options((__u32 *)(th + 1),
+ tp, tcb->when);
+ TCP_ECN_send(sk, tp, skb, tcp_header_size);
+ }
- if (tcb->flags & TCPCB_FLAG_ACK)
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+ tp->af_specific->send_check(sk, th, skb->len, skb);
- if (skb->len != tcp_header_size)
- tcp_event_data_sent(tp, skb, sk);
+ if (likely(tcb->flags & TCPCB_FLAG_ACK))
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
- TCP_INC_STATS(TCP_MIB_OUTSEGS);
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, skb, sk);
- err = tp->af_specific->queue_xmit(skb, 0);
- if (err <= 0)
- return err;
+ TCP_INC_STATS(TCP_MIB_OUTSEGS);
- tcp_enter_cwr(sk);
+ err = tp->af_specific->queue_xmit(skb, 0);
+ if (unlikely(err <= 0))
+ return err;
+
+ tcp_enter_cwr(sk);
+
+ /* NET_XMIT_CN is special. It does not guarantee,
+ * that this packet is lost. It tells that device
+ * is about to start to drop packets or already
+ * drops some packets of the same priority and
+ * invokes us to send less aggressively.
+ */
+ return err == NET_XMIT_CN ? 0 : err;
- /* NET_XMIT_CN is special. It does not guarantee,
- * that this packet is lost. It tells that device
- * is about to start to drop packets or already
- * drops some packets of the same priority and
- * invokes us to send less aggressively.
- */
- return err == NET_XMIT_CN ? 0 : err;
- }
- return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
@@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+ if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
break;
/* Advance the send_head. This one is sent out.
@@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+ if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
update_send_head(sk, tp, skb);
tcp_cwnd_validate(sk, tp);
return;
@@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
- pskb_copy(skb, GFP_ATOMIC):
- skb_clone(skb, GFP_ATOMIC)));
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (err == 0) {
/* Update global TCP statistics. */
@@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (tcp_transmit_skb(sk, skb))
+ if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
}
@@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk)
TCP_ECN_send_synack(tcp_sk(sk), skb);
}
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
/*
@@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk)
__skb_queue_tail(&sk->sk_write_queue, buff);
sk_charge_skb(sk, buff);
tp->packets_out += tcp_skb_pcount(buff);
- tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+ tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
@@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff);
+ tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
}
}
@@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- return tcp_transmit_skb(sk, skb);
+ return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
int tcp_write_wakeup(struct sock *sk)
@@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk)
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err) {
update_send_head(sk, tp, skb);
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index b7d296a..13e7e6e 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
vegas->beg_snd_nxt = tp->snd_nxt;
vegas->beg_snd_cwnd = tp->snd_cwnd;
- /* Take into account the current RTT sample too, to
- * decrease the impact of delayed acks. This double counts
- * this sample since we count it for the next window as well,
- * but that's not too awful, since we're taking the min,
- * rather than averaging.
- */
- tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
-
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
* at least one RTT sample that wasn't from a delayed ACK.
@@ -333,11 +325,11 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
tp->snd_cwnd = tp->snd_cwnd_clamp;
}
- }
- /* Wipe the slate clean for the next RTT. */
- vegas->cntRTT = 0;
- vegas->minRTT = 0x7fffffff;
+ /* Wipe the slate clean for the next RTT. */
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+ }
}
/* Extract info for Tcp socket info provided via netlink. */