From 8f49c2703b33519aaaccc63f571b465b9d2b3a2d Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Fri, 12 Nov 2010 13:35:00 -0800
Subject: tcp: Don't change unlocked socket state in tcp_v4_err().

Alexey Kuznetsov noticed a regression introduced by commit
f1ecd5d9e7366609d640ff4040304ea197fbc618 ("Revert Backoff [v3]: Revert
RTO on ICMP destination unreachable")

The RTO and timer modification code added to tcp_v4_err() doesn't check
sock_owned_by_user(), which if true means we don't have exclusive access
to the socket and therefore cannot modify its critical state.

Just skip this new code block if sock_owned_by_user() is true and
eliminate the now superfluous sock_owned_by_user() code block contained
within.

Reported-by: Alexey Kuznetsov
Signed-off-by: David S. Miller
CC: Damian Lukowski
Acked-by: Eric Dumazet
---
 net/ipv4/tcp_ipv4.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8f8527d..69ccbc1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -415,6 +415,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			    !icsk->icsk_backoff)
 				break;
 
+			if (sock_owned_by_user(sk))
+				break;
+
 			icsk->icsk_backoff--;
 			inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 						 icsk->icsk_backoff;
@@ -429,11 +432,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			if (remaining) {
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 							  remaining, TCP_RTO_MAX);
-			} else if (sock_owned_by_user(sk)) {
-				/* RTO revert clocked out retransmission,
-				 * but socket is locked. Will defer. */
-				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-							  HZ/20, TCP_RTO_MAX);
 			} else {
 				/* RTO revert clocked out retransmission.
 				 * Will retransmit now */
--
cgit v1.1
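For context, the fix relies on the usual softirq locking convention: bh_lock_sock() only takes the socket spinlock, and sock_owned_by_user() reports whether a process currently holds the socket, in which case softirq code must not modify protected state. A minimal illustrative sketch of that pattern follows (the handler name is hypothetical; tcp_v4_err() itself already takes the lock earlier in the function):

#include <net/sock.h>

/*
 * Illustrative sketch only -- a hypothetical handler showing the locking
 * pattern the fix above relies on; it is not code from the patch.
 */
static void example_softirq_err_handler(struct sock *sk)
{
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/*
		 * A process currently owns the socket: icsk_backoff,
		 * icsk_rto and the retransmit timer must not be touched
		 * from softirq context.  Either defer the work or, as the
		 * patch above does for the RTO-revert block, skip it.
		 */
		goto out;
	}

	/* Exclusive access here: safe to revert the RTO and re-arm timers. */
out:
	bh_unlock_sock(sk);
}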
From 8475ef9fd16cadbfc692f78e608d1941a340beb2 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Mon, 22 Nov 2010 03:26:12 +0000
Subject: netns: Don't leak others' openreq-s in proc

The /proc/net/tcp leaks openreq sockets from other namespaces.

Signed-off-by: Pavel Emelyanov
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 69ccbc1..e13da6d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2043,7 +2043,9 @@ get_req:
 	}
 get_sk:
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		if (sk->sk_family == st->family) {
 			cur = sk;
 			goto out;
 		}
--
cgit v1.1

From 582a72da9a41be9227dc931d728ae2906880a589 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 30 Nov 2010 11:53:55 -0800
Subject: inetpeer: Introduce inet_peer_address_t.

Currently only the v4 aspect is used, but this will change.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 69ccbc1..b8bbf89 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1347,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	    tcp_death_row.sysctl_tw_recycle &&
 	    (dst = inet_csk_route_req(sk, req)) != NULL &&
 	    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
-	    peer->v4daddr == saddr) {
+	    peer->daddr.a4 == saddr) {
 		inet_peer_refcheck(peer);
 		if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
 		    (s32)(peer->tcp_ts - req->ts_recent) >
--
cgit v1.1

From b534ecf1cd26f094497da6ae28a6ab64cdbe1617 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 30 Nov 2010 11:54:19 -0800
Subject: inetpeer: Make inet_getpeer() take an inet_peer_adress_t pointer.

And make an inet_getpeer_v4() helper, update callers.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b8bbf89..00285fc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1778,7 +1778,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
 	int release_it = 0;
 
 	if (!rt || rt->rt_dst != inet->inet_daddr) {
-		peer = inet_getpeer(inet->inet_daddr, 1);
+		peer = inet_getpeer_v4(inet->inet_daddr, 1);
 		release_it = 1;
 	} else {
 		if (!rt->peer)
@@ -1804,7 +1804,7 @@ EXPORT_SYMBOL(tcp_v4_remember_stamp);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
-	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
+	struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1);
 
 	if (peer) {
 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
--
cgit v1.1
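The two inetpeer patches above only show the tcp_ipv4.c side. A hedged sketch of the header-side shape they imply, the union type and the inet_getpeer_v4() wrapper, inferred from peer->daddr.a4 and the call sites; the field layout and the exact inet_getpeer() signature are assumptions, not quotes from include/net/inetpeer.h:

#include <linux/types.h>

struct inet_peer;

/*
 * Inferred sketch: an address container whose v4 member matches the
 * peer->daddr.a4 usage in the diff above, plus a thin v4 helper that
 * matches the inet_getpeer_v4(addr, create) call sites.
 */
typedef struct {
	union {
		__be32	a4;	/* IPv4 destination address		*/
		__be32	a6[4];	/* room for an IPv6 address later on	*/
	};
} inet_peer_address_t;

struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create);

static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
{
	inet_peer_address_t daddr;

	daddr.a4 = v4daddr;
	return inet_getpeer(&daddr, create);
}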
From 3f419d2d487821093ee46e898b5f8747f9edc9cd Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Mon, 29 Nov 2010 13:37:14 -0800
Subject: inet: Turn ->remember_stamp into ->get_peer in connection AF ops.

Then we can make a completely generic tcp_remember_stamp()
that uses ->get_peer() as a helper, minimizing the AF specific
code and minimizing the eventual code duplication when we
implement the ipv6 side of TW recycling.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 35 ++++++++---------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 00285fc..0ddf819 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1763,44 +1763,25 @@ do_time_wait:
 	goto discard_it;
 }
 
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-int tcp_v4_remember_stamp(struct sock *sk)
+struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
 {
+	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
 	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
-	struct inet_peer *peer = NULL;
-	int release_it = 0;
+	struct inet_peer *peer;
 
 	if (!rt || rt->rt_dst != inet->inet_daddr) {
 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
-		release_it = 1;
+		*release_it = true;
 	} else {
 		if (!rt->peer)
 			rt_bind_peer(rt, 1);
 		peer = rt->peer;
+		*release_it = false;
 	}
 
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
-			peer->tcp_ts = tp->rx_opt.ts_recent;
-		}
-		if (release_it)
-			inet_putpeer(peer);
-		return 1;
-	}
-
-	return 0;
+	return peer;
 }
-EXPORT_SYMBOL(tcp_v4_remember_stamp);
+EXPORT_SYMBOL(tcp_v4_get_peer);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
@@ -1828,7 +1809,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.rebuild_header	   = inet_sk_rebuild_header,
 	.conn_request	   = tcp_v4_conn_request,
 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
-	.remember_stamp	   = tcp_v4_remember_stamp,
+	.get_peer	   = tcp_v4_get_peer,
 	.net_header_len	   = sizeof(struct iphdr),
 	.setsockopt	   = ip_setsockopt,
 	.getsockopt	   = ip_getsockopt,
--
cgit v1.1
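The commit message says the point of ->get_peer() is a single AF-generic tcp_remember_stamp(). A sketch of what that generic helper plausibly looks like, reusing the timestamp-comparison logic deleted from tcp_v4_remember_stamp() in the hunk above; the helper's exact name and location (likely tcp_minisocks.c) are not part of this diff, so treat this as an approximation rather than the committed code:

#include <net/tcp.h>
#include <net/inetpeer.h>

/* Sketch: AF-generic version assembled from the logic removed above. */
int tcp_remember_stamp(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_peer *peer;
	bool release_it;

	/* Ask the address family for the peer instead of open-coding v4. */
	peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}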
From ccb7c410ddc054b8c1ae780319bc98ae092d3854 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Wed, 1 Dec 2010 18:09:13 -0800
Subject: timewait_sock: Create and use getpeer op.

The only thing AF-specific about remembering the timestamp
for a time-wait TCP socket is getting the peer.

Abstract that behind a new timewait_sock_ops vector.

Support for real IPV6 sockets is not filled in yet, but
curiously this makes timewait recycling start to work for
v4-mapped ipv6 sockets.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0ddf819..dd55505 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1210,12 +1210,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 };
 #endif
 
-static struct timewait_sock_ops tcp_timewait_sock_ops = {
-	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
-	.twsk_unique	= tcp_twsk_unique,
-	.twsk_destructor= tcp_twsk_destructor,
-};
-
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_extend_values tmp_ext;
@@ -1783,25 +1777,20 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
 }
 EXPORT_SYMBOL(tcp_v4_get_peer);
 
-int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
+void *tcp_v4_tw_get_peer(struct sock *sk)
 {
-	struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1);
-
-	if (peer) {
-		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
-
-		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
-			peer->tcp_ts = tcptw->tw_ts_recent;
-		}
-		inet_putpeer(peer);
-		return 1;
-	}
+	struct inet_timewait_sock *tw = inet_twsk(sk);
 
-	return 0;
+	return inet_getpeer_v4(tw->tw_daddr, 1);
 }
+EXPORT_SYMBOL(tcp_v4_tw_get_peer);
+
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+	.twsk_destructor= tcp_twsk_destructor,
+	.twsk_getpeer	= tcp_v4_tw_get_peer,
+};
 
 const struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	   = ip_queue_xmit,
--
cgit v1.1

From 0dbaee3b37e118a96bb7b8eb0d9bbaeeb46264be Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Mon, 13 Dec 2010 12:52:14 -0800
Subject: net: Abstract default ADVMSS behind an accessor.

Make all RTAX_ADVMSS metric accesses go through a new helper function,
dst_metric_advmss().

Leave the actual default metric as "zero" in the real metric slot,
and compute the actual default value dynamically via a new dst_ops
AF specific callback.

For stacked IPSEC routes, we use the advmss of the path which
preserves existing behavior.

Unlike ipv4/ipv6, DecNET ties the advmss to the mtu and thus updates
advmss on pmtu updates.  This inconsistency in advmss handling
results in more raw metric accesses than I wish we ended up with.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4fc3387..f401102 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1436,7 +1436,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
--
cgit v1.1
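The ADVMSS patch above only shows the tcp_ipv4.c call-site change; the commit message describes the accessor itself. A minimal sketch of what such a helper looks like, assuming a raw-metric read helper and a default_advmss dst_ops callback as described in the message; the actual definitions live in include/net/dst.h and are not shown in this fragment:

#include <net/dst.h>

/*
 * Sketch of the accessor described above: use the raw RTAX_ADVMSS metric
 * if it is set, otherwise fall back to the per-family dst_ops callback.
 * The raw-read helper and the callback name are assumed, not quoted.
 */
static inline u32 dst_metric_advmss(const struct dst_entry *dst)
{
	u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (!advmss)
		advmss = dst->ops->default_advmss(dst);

	return advmss;
}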
From 1bde5ac49398a064c753bb490535cfad89e99a5f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 23 Dec 2010 09:32:46 -0800
Subject: tcp: fix listening_get_next()

Alexey Vlasov found /proc/net/tcp could sometimes loop and display
millions of sockets in LISTEN state.

In 2.6.29, when we converted TCP hash tables to RCU, we left two
sk_next() calls in listening_get_next().

We must instead use sk_nulls_next() to properly detect an end of chain.

Reported-by: Alexey Vlasov
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e13da6d..d978bb2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2030,7 +2030,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 get_req:
 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
 		}
-		sk	  = sk_next(st->syn_wait_sk);
+		sk	  = sk_nulls_next(st->syn_wait_sk);
 		st->state = TCP_SEQ_STATE_LISTENING;
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	} else {
@@ -2039,7 +2039,7 @@ get_req:
 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
 			goto start_req;
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 	}
 get_sk:
 	sk_nulls_for_each_from(sk, node) {
--
cgit v1.1

From fd0273c5033630b8673554cd39660435d1ab2ac4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 24 Jan 2011 14:41:20 -0800
Subject: tcp: fix bug in listening_get_next()

commit a8b690f98baf9fb19 (tcp: Fix slowness in read /proc/net/tcp)
introduced a bug in handling of SYN_RECV sockets.

st->offset represents number of sockets found since beginning of
listening_hash[st->bucket].

We should not reset st->offset when iterating through
syn_table[st->sbucket], or else if more than ~25 sockets (if
PAGE_SIZE=4096) are in SYN_RECV state, we exit from
listening_get_next() with a too-small st->offset.

Next time we enter tcp_seek_last_pos(), we are not able to seek past
already found sockets.

Reported-by: PK
CC: Tom Herbert
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4/tcp_ipv4.c')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 856f684..02f583b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1994,7 +1994,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 			}
 			req = req->dl_next;
 		}
-		st->offset = 0;
 		if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 			break;
 get_req:
--
cgit v1.1
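Both /proc/net/tcp fixes above touch the same listening_get_next() iterator; the first hinges on how RCU "nulls" hash chains are terminated. A purely illustrative toy model of that convention in plain C, rather than the kernel's hlist_nulls types:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Toy model (not kernel code): an RCU hash chain ends not with NULL but
 * with an odd-valued "nulls" marker that encodes which bucket the chain
 * belongs to, so a lockless reader can notice it was moved to another
 * chain.  A walker that only tests for NULL (the old sk_next() usage
 * fixed above) never sees the end of the chain and can loop; a
 * nulls-aware walker stops at the marker.
 */
struct node {
	struct node *next;	/* real node, or an odd "nulls" marker */
};

static bool is_nulls_marker(const struct node *p)
{
	return (uintptr_t)p & 1UL;	/* markers are odd-valued */
}

static struct node *nulls_next(const struct node *n)
{
	return is_nulls_marker(n->next) ? NULL : n->next;
}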