diff options
34 files changed, 1191 insertions, 617 deletions
diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt new file mode 100644 index 0000000..f50d0c6 --- /dev/null +++ b/Documentation/networking/fib_trie.txt @@ -0,0 +1,145 @@ + LC-trie implementation notes. + +Node types +---------- +leaf + An end node with data. This has a copy of the relevant key, along + with 'hlist' with routing table entries sorted by prefix length. + See struct leaf and struct leaf_info. + +trie node or tnode + An internal node, holding an array of child (leaf or tnode) pointers, + indexed through a subset of the key. See Level Compression. + +A few concepts explained +------------------------ +Bits (tnode) + The number of bits in the key segment used for indexing into the + child array - the "child index". See Level Compression. + +Pos (tnode) + The position (in the key) of the key segment used for indexing into + the child array. See Path Compression. + +Path Compression / skipped bits + Any given tnode is linked to from the child array of its parent, using + a segment of the key specified by the parent's "pos" and "bits" + In certain cases, this tnode's own "pos" will not be immediately + adjacent to the parent (pos+bits), but there will be some bits + in the key skipped over because they represent a single path with no + deviations. These "skipped bits" constitute Path Compression. + Note that the search algorithm will simply skip over these bits when + searching, making it necessary to save the keys in the leaves to + verify that they actually do match the key we are searching for. + +Level Compression / child arrays + the trie is kept level balanced moving, under certain conditions, the + children of a full child (see "full_children") up one level, so that + instead of a pure binary tree, each internal node ("tnode") may + contain an arbitrarily large array of links to several children. + Conversely, a tnode with a mostly empty child array (see empty_children) + may be "halved", having some of its children moved downwards one level, + in order to avoid ever-increasing child arrays. + +empty_children + the number of positions in the child array of a given tnode that are + NULL. + +full_children + the number of children of a given tnode that aren't path compressed. + (in other words, they aren't NULL or leaves and their "pos" is equal + to this tnode's "pos"+"bits"). + + (The word "full" here is used more in the sense of "complete" than + as the opposite of "empty", which might be a tad confusing.) + +Comments +--------- + +We have tried to keep the structure of the code as close to fib_hash as +possible to allow verification and help up reviewing. + +fib_find_node() + A good start for understanding this code. This function implements a + straightforward trie lookup. + +fib_insert_node() + Inserts a new leaf node in the trie. This is bit more complicated than + fib_find_node(). Inserting a new node means we might have to run the + level compression algorithm on part of the trie. + +trie_leaf_remove() + Looks up a key, deletes it and runs the level compression algorithm. + +trie_rebalance() + The key function for the dynamic trie after any change in the trie + it is run to optimize and reorganize. Tt will walk the trie upwards + towards the root from a given tnode, doing a resize() at each step + to implement level compression. + +resize() + Analyzes a tnode and optimizes the child array size by either inflating + or shrinking it repeatedly until it fullfills the criteria for optimal + level compression. This part follows the original paper pretty closely + and there may be some room for experimentation here. + +inflate() + Doubles the size of the child array within a tnode. Used by resize(). + +halve() + Halves the size of the child array within a tnode - the inverse of + inflate(). Used by resize(); + +fn_trie_insert(), fn_trie_delete(), fn_trie_select_default() + The route manipulation functions. Should conform pretty closely to the + corresponding functions in fib_hash. + +fn_trie_flush() + This walks the full trie (using nextleaf()) and searches for empty + leaves which have to be removed. + +fn_trie_dump() + Dumps the routing table ordered by prefix length. This is somewhat + slower than the corresponding fib_hash function, as we have to walk the + entire trie for each prefix length. In comparison, fib_hash is organized + as one "zone"/hash per prefix length. + +Locking +------- + +fib_lock is used for an RW-lock in the same way that this is done in fib_hash. +However, the functions are somewhat separated for other possible locking +scenarios. It might conceivably be possible to run trie_rebalance via RCU +to avoid read_lock in the fn_trie_lookup() function. + +Main lookup mechanism +--------------------- +fn_trie_lookup() is the main lookup function. + +The lookup is in its simplest form just like fib_find_node(). We descend the +trie, key segment by key segment, until we find a leaf. check_leaf() does +the fib_semantic_match in the leaf's sorted prefix hlist. + +If we find a match, we are done. + +If we don't find a match, we enter prefix matching mode. The prefix length, +starting out at the same as the key length, is reduced one step at a time, +and we backtrack upwards through the trie trying to find a longest matching +prefix. The goal is always to reach a leaf and get a positive result from the +fib_semantic_match mechanism. + +Inside each tnode, the search for longest matching prefix consists of searching +through the child array, chopping off (zeroing) the least significant "1" of +the child index until we find a match or the child index consists of nothing but +zeros. + +At this point we backtrack (t->stats.backtrack++) up the trie, continuing to +chop off part of the key in order to find the longest matching prefix. + +At this point we will repeatedly descend subtries to look for a match, and there +are some optimizations available that can provide us with "shortcuts" to avoid +descending into dead ends. Look for "HL_OPTIMIZE" sections in the code. + +To alleviate any doubts about the correctness of the route selection process, +a new netlink operation has been added. Look for NETLINK_FIB_LOOKUP, which +gives userland access to fib_lookup(). diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c index 20edeb3..3ad0b67 100644 --- a/drivers/net/shaper.c +++ b/drivers/net/shaper.c @@ -135,10 +135,8 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct shaper *shaper = dev->priv; struct sk_buff *ptr; - - if (down_trylock(&shaper->sem)) - return -1; - + + spin_lock(&shaper->lock); ptr=shaper->sendq.prev; /* @@ -232,7 +230,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) shaper->stats.collisions++; } shaper_kick(shaper); - up(&shaper->sem); + spin_unlock(&shaper->lock); return 0; } @@ -271,11 +269,9 @@ static void shaper_timer(unsigned long data) { struct shaper *shaper = (struct shaper *)data; - if (!down_trylock(&shaper->sem)) { - shaper_kick(shaper); - up(&shaper->sem); - } else - mod_timer(&shaper->timer, jiffies); + spin_lock(&shaper->lock); + shaper_kick(shaper); + spin_unlock(&shaper->lock); } /* @@ -332,21 +328,6 @@ static void shaper_kick(struct shaper *shaper) /* - * Flush the shaper queues on a closedown - */ - -static void shaper_flush(struct shaper *shaper) -{ - struct sk_buff *skb; - - down(&shaper->sem); - while((skb=skb_dequeue(&shaper->sendq))!=NULL) - dev_kfree_skb(skb); - shaper_kick(shaper); - up(&shaper->sem); -} - -/* * Bring the interface up. We just disallow this until a * bind. */ @@ -375,7 +356,15 @@ static int shaper_open(struct net_device *dev) static int shaper_close(struct net_device *dev) { struct shaper *shaper=dev->priv; - shaper_flush(shaper); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&shaper->sendq)) != NULL) + dev_kfree_skb(skb); + + spin_lock_bh(&shaper->lock); + shaper_kick(shaper); + spin_unlock_bh(&shaper->lock); + del_timer_sync(&shaper->timer); return 0; } @@ -576,6 +565,7 @@ static void shaper_init_priv(struct net_device *dev) init_timer(&sh->timer); sh->timer.function=shaper_timer; sh->timer.data=(unsigned long)sh; + spin_lock_init(&sh->lock); } /* diff --git a/drivers/net/skge.h b/drivers/net/skge.h index 14d0cc0..fced3d2 100644 --- a/drivers/net/skge.h +++ b/drivers/net/skge.h @@ -7,6 +7,7 @@ /* PCI config registers */ #define PCI_DEV_REG1 0x40 #define PCI_DEV_REG2 0x44 +#define PCI_REV_DESC 0x4 #define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ PCI_STATUS_SIG_SYSTEM_ERROR | \ diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 7e371b1..5464068 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -66,8 +66,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.32" -#define DRV_MODULE_RELDATE "June 24, 2005" +#define DRV_MODULE_VERSION "3.33" +#define DRV_MODULE_RELDATE "July 5, 2005" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 @@ -5117,7 +5117,7 @@ static void tg3_set_bdinfo(struct tg3 *tp, u32 bdinfo_addr, } static void __tg3_set_rx_mode(struct net_device *); -static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) +static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) { tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs); tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs); @@ -5460,7 +5460,7 @@ static int tg3_reset_hw(struct tg3 *tp) udelay(10); } - tg3_set_coalesce(tp, &tp->coal); + __tg3_set_coalesce(tp, &tp->coal); /* set status block DMA address */ tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH, @@ -7821,6 +7821,60 @@ static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) return 0; } +static int tg3_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) +{ + struct tg3 *tp = netdev_priv(dev); + u32 max_rxcoal_tick_int = 0, max_txcoal_tick_int = 0; + u32 max_stat_coal_ticks = 0, min_stat_coal_ticks = 0; + + if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { + max_rxcoal_tick_int = MAX_RXCOAL_TICK_INT; + max_txcoal_tick_int = MAX_TXCOAL_TICK_INT; + max_stat_coal_ticks = MAX_STAT_COAL_TICKS; + min_stat_coal_ticks = MIN_STAT_COAL_TICKS; + } + + if ((ec->rx_coalesce_usecs > MAX_RXCOL_TICKS) || + (ec->tx_coalesce_usecs > MAX_TXCOL_TICKS) || + (ec->rx_max_coalesced_frames > MAX_RXMAX_FRAMES) || + (ec->tx_max_coalesced_frames > MAX_TXMAX_FRAMES) || + (ec->rx_coalesce_usecs_irq > max_rxcoal_tick_int) || + (ec->tx_coalesce_usecs_irq > max_txcoal_tick_int) || + (ec->rx_max_coalesced_frames_irq > MAX_RXCOAL_MAXF_INT) || + (ec->tx_max_coalesced_frames_irq > MAX_TXCOAL_MAXF_INT) || + (ec->stats_block_coalesce_usecs > max_stat_coal_ticks) || + (ec->stats_block_coalesce_usecs < min_stat_coal_ticks)) + return -EINVAL; + + /* No rx interrupts will be generated if both are zero */ + if ((ec->rx_coalesce_usecs == 0) && + (ec->rx_max_coalesced_frames == 0)) + return -EINVAL; + + /* No tx interrupts will be generated if both are zero */ + if ((ec->tx_coalesce_usecs == 0) && + (ec->tx_max_coalesced_frames == 0)) + return -EINVAL; + + /* Only copy relevant parameters, ignore all others. */ + tp->coal.rx_coalesce_usecs = ec->rx_coalesce_usecs; + tp->coal.tx_coalesce_usecs = ec->tx_coalesce_usecs; + tp->coal.rx_max_coalesced_frames = ec->rx_max_coalesced_frames; + tp->coal.tx_max_coalesced_frames = ec->tx_max_coalesced_frames; + tp->coal.rx_coalesce_usecs_irq = ec->rx_coalesce_usecs_irq; + tp->coal.tx_coalesce_usecs_irq = ec->tx_coalesce_usecs_irq; + tp->coal.rx_max_coalesced_frames_irq = ec->rx_max_coalesced_frames_irq; + tp->coal.tx_max_coalesced_frames_irq = ec->tx_max_coalesced_frames_irq; + tp->coal.stats_block_coalesce_usecs = ec->stats_block_coalesce_usecs; + + if (netif_running(dev)) { + tg3_full_lock(tp, 0); + __tg3_set_coalesce(tp, &tp->coal); + tg3_full_unlock(tp); + } + return 0; +} + static struct ethtool_ops tg3_ethtool_ops = { .get_settings = tg3_get_settings, .set_settings = tg3_set_settings, @@ -7856,6 +7910,7 @@ static struct ethtool_ops tg3_ethtool_ops = { .get_stats_count = tg3_get_stats_count, .get_ethtool_stats = tg3_get_ethtool_stats, .get_coalesce = tg3_get_coalesce, + .set_coalesce = tg3_set_coalesce, }; static void __devinit tg3_get_eeprom_size(struct tg3 *tp) @@ -9800,6 +9855,12 @@ static void __devinit tg3_init_coal(struct tg3 *tp) ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS; ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS; } + + if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) { + ec->rx_coalesce_usecs_irq = 0; + ec->tx_coalesce_usecs_irq = 0; + ec->stats_block_coalesce_usecs = 0; + } } static int __devinit tg3_init_one(struct pci_dev *pdev, diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 99c5f96..70ad450 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -879,31 +879,41 @@ #define LOW_RXCOL_TICKS_CLRTCKS 0x00000014 #define DEFAULT_RXCOL_TICKS 0x00000048 #define HIGH_RXCOL_TICKS 0x00000096 +#define MAX_RXCOL_TICKS 0x000003ff #define HOSTCC_TXCOL_TICKS 0x00003c0c #define LOW_TXCOL_TICKS 0x00000096 #define LOW_TXCOL_TICKS_CLRTCKS 0x00000048 #define DEFAULT_TXCOL_TICKS 0x0000012c #define HIGH_TXCOL_TICKS 0x00000145 +#define MAX_TXCOL_TICKS 0x000003ff #define HOSTCC_RXMAX_FRAMES 0x00003c10 #define LOW_RXMAX_FRAMES 0x00000005 #define DEFAULT_RXMAX_FRAMES 0x00000008 #define HIGH_RXMAX_FRAMES 0x00000012 +#define MAX_RXMAX_FRAMES 0x000000ff #define HOSTCC_TXMAX_FRAMES 0x00003c14 #define LOW_TXMAX_FRAMES 0x00000035 #define DEFAULT_TXMAX_FRAMES 0x0000004b #define HIGH_TXMAX_FRAMES 0x00000052 +#define MAX_TXMAX_FRAMES 0x000000ff #define HOSTCC_RXCOAL_TICK_INT 0x00003c18 #define DEFAULT_RXCOAL_TICK_INT 0x00000019 #define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014 +#define MAX_RXCOAL_TICK_INT 0x000003ff #define HOSTCC_TXCOAL_TICK_INT 0x00003c1c #define DEFAULT_TXCOAL_TICK_INT 0x00000019 #define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014 +#define MAX_TXCOAL_TICK_INT 0x000003ff #define HOSTCC_RXCOAL_MAXF_INT 0x00003c20 #define DEFAULT_RXCOAL_MAXF_INT 0x00000005 +#define MAX_RXCOAL_MAXF_INT 0x000000ff #define HOSTCC_TXCOAL_MAXF_INT 0x00003c24 #define DEFAULT_TXCOAL_MAXF_INT 0x00000005 +#define MAX_TXCOAL_MAXF_INT 0x000000ff #define HOSTCC_STAT_COAL_TICKS 0x00003c28 #define DEFAULT_STAT_COAL_TICKS 0x000f4240 +#define MAX_STAT_COAL_TICKS 0xd693d400 +#define MIN_STAT_COAL_TICKS 0x00000064 /* 0x3c2c --> 0x3c30 unused */ #define HOSTCC_STATS_BLK_HOST_ADDR 0x00003c30 /* 64-bit */ #define HOSTCC_STATUS_BLK_HOST_ADDR 0x00003c38 /* 64-bit */ diff --git a/include/linux/if_shaper.h b/include/linux/if_shaper.h index 004e6f0..68c896a 100644 --- a/include/linux/if_shaper.h +++ b/include/linux/if_shaper.h @@ -23,7 +23,7 @@ struct shaper __u32 shapeclock; unsigned long recovery; /* Time we can next clock a packet out on an empty queue */ - struct semaphore sem; + spinlock_t lock; struct net_device_stats stats; struct net_device *dev; int (*hard_start_xmit) (struct sk_buff *skb, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 416a2e4..14b9504 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -183,7 +183,6 @@ struct skb_shared_info { * @priority: Packet queueing priority * @users: User count - see {datagram,tcp}.c * @protocol: Packet protocol from driver - * @security: Security level of packet * @truesize: Buffer size * @head: Head of buffer * @data: Data head pointer @@ -249,18 +248,18 @@ struct sk_buff { data_len, mac_len, csum; - unsigned char local_df, - cloned:1, - nohdr:1, - pkt_type, - ip_summed; __u32 priority; - unsigned short protocol, - security; + __u8 local_df:1, + cloned:1, + ip_summed:2, + nohdr:1; + /* 3 bits spare */ + __u8 pkt_type; + __u16 protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER - unsigned long nfmark; + unsigned long nfmark; __u32 nfcache; __u32 nfctinfo; struct nf_conntrack *nfct; @@ -1211,7 +1210,7 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, { int hlen = skb_headlen(skb); - if (offset + len <= hlen) + if (hlen - offset >= len) return skb->data + offset; if (skb_copy_bits(skb, offset, buffer, len) < 0) diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h index a6b2cc5..bcb762d 100644 --- a/include/linux/tc_ematch/tc_em_meta.h +++ b/include/linux/tc_ematch/tc_em_meta.h @@ -45,7 +45,7 @@ enum TCF_META_ID_REALDEV, TCF_META_ID_PRIORITY, TCF_META_ID_PROTOCOL, - TCF_META_ID_SECURITY, + TCF_META_ID_SECURITY, /* obsolete */ TCF_META_ID_PKTTYPE, TCF_META_ID_PKTLEN, TCF_META_ID_DATALEN, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index dfd93d0..e4fd82e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -286,7 +286,7 @@ struct tcp_sock { __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ - __u16 mss_cache_std; /* Like mss_cache, but without TSO */ + __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ __u8 ca_state; /* State of fast-retransmit machine */ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index fcb05a3..6492e73 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -13,13 +13,12 @@ struct qdisc_walker extern rwlock_t qdisc_tree_lock; -#define QDISC_ALIGN 32 -#define QDISC_ALIGN_CONST (QDISC_ALIGN - 1) +#define QDISC_ALIGNTO 32 +#define QDISC_ALIGN(len) (((len) + QDISC_ALIGNTO-1) & ~(QDISC_ALIGNTO-1)) static inline void *qdisc_priv(struct Qdisc *q) { - return (char *)q + ((sizeof(struct Qdisc) + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); + return (char *) q + QDISC_ALIGN(sizeof(struct Qdisc)); } /* @@ -207,8 +206,6 @@ psched_tod_diff(int delta_sec, int bound) #endif /* !CONFIG_NET_SCH_CLK_GETTIMEOFDAY */ -extern struct Qdisc noop_qdisc; -extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; @@ -216,14 +213,6 @@ extern int register_qdisc(struct Qdisc_ops *qops); extern int unregister_qdisc(struct Qdisc_ops *qops); extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); -extern void dev_init_scheduler(struct net_device *dev); -extern void dev_shutdown(struct net_device *dev); -extern void dev_activate(struct net_device *dev); -extern void dev_deactivate(struct net_device *dev); -extern void qdisc_reset(struct Qdisc *qdisc); -extern void qdisc_destroy(struct Qdisc *qdisc); -extern struct Qdisc * qdisc_create_dflt(struct net_device *dev, - struct Qdisc_ops *ops); extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab); extern void qdisc_put_rtab(struct qdisc_rate_table *tab); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7b97405..7b6ec99 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -164,6 +164,19 @@ extern void qdisc_unlock_tree(struct net_device *dev); #define tcf_tree_lock(tp) qdisc_lock_tree((tp)->q->dev) #define tcf_tree_unlock(tp) qdisc_unlock_tree((tp)->q->dev) +extern struct Qdisc noop_qdisc; +extern struct Qdisc_ops noop_qdisc_ops; + +extern void dev_init_scheduler(struct net_device *dev); +extern void dev_shutdown(struct net_device *dev); +extern void dev_activate(struct net_device *dev); +extern void dev_deactivate(struct net_device *dev); +extern void qdisc_reset(struct Qdisc *qdisc); +extern void qdisc_destroy(struct Qdisc *qdisc); +extern struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops); +extern struct Qdisc *qdisc_create_dflt(struct net_device *dev, + struct Qdisc_ops *ops); + static inline void tcf_destroy(struct tcf_proto *tp) { diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h index 0b2c278..8716d59 100644 --- a/include/net/slhc_vj.h +++ b/include/net/slhc_vj.h @@ -170,19 +170,14 @@ struct slcompress { }; #define NULLSLCOMPR (struct slcompress *)0 -#define __ARGS(x) x - /* In slhc.c: */ -struct slcompress *slhc_init __ARGS((int rslots, int tslots)); -void slhc_free __ARGS((struct slcompress *comp)); - -int slhc_compress __ARGS((struct slcompress *comp, unsigned char *icp, - int isize, unsigned char *ocp, unsigned char **cpp, - int compress_cid)); -int slhc_uncompress __ARGS((struct slcompress *comp, unsigned char *icp, - int isize)); -int slhc_remember __ARGS((struct slcompress *comp, unsigned char *icp, - int isize)); -int slhc_toss __ARGS((struct slcompress *comp)); +struct slcompress *slhc_init(int rslots, int tslots); +void slhc_free(struct slcompress *comp); + +int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, + unsigned char *ocp, unsigned char **cpp, int compress_cid); +int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize); +int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize); +int slhc_toss(struct slcompress *comp); #endif /* _SLHC_H */ diff --git a/include/net/sock.h b/include/net/sock.h index e593af5..7b76f89 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1134,13 +1134,16 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, int size, int mem, int gfp) { - struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp); + struct sk_buff *skb; + int hdr_len; + hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); + skb = alloc_skb(size + hdr_len, gfp); if (skb) { skb->truesize += mem; if (sk->sk_forward_alloc >= (int)skb->truesize || sk_stream_mem_schedule(sk, skb->truesize, 0)) { - skb_reserve(skb, sk->sk_prot->max_header); + skb_reserve(skb, hdr_len); return skb; } __kfree_skb(skb); diff --git a/include/net/tcp.h b/include/net/tcp.h index ec9e20c..a166918 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -721,11 +721,16 @@ static inline int tcp_ack_scheduled(struct tcp_sock *tp) return tp->ack.pending&TCP_ACK_SCHED; } -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp) +static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) { - if (tp->ack.quick && --tp->ack.quick == 0) { - /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + if (tp->ack.quick) { + if (pkts >= tp->ack.quick) { + tp->ack.quick = 0; + + /* Leaving quickack mode we deflate ATO. */ + tp->ack.ato = TCP_ATO_MIN; + } else + tp->ack.quick -= pkts; } } @@ -843,7 +848,9 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ -extern int tcp_write_xmit(struct sock *, int nonagle); +extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned int cur_mss, int nonagle); +extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_xmit_retransmit_queue(struct sock *); extern void tcp_simple_retransmit(struct sock *); @@ -855,10 +862,13 @@ extern int tcp_write_wakeup(struct sock *); extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, int priority); extern int tcp_send_synack(struct sock *); -extern void tcp_push_one(struct sock *, unsigned mss_now); +extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); +/* tcp_input.c */ +extern void tcp_cwnd_application_limited(struct sock *sk); + /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); extern void tcp_clear_xmit_timers(struct sock *); @@ -958,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long static inline void tcp_initialize_rcv_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min(tp->advmss, tp->mss_cache_std); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd/2); hint = min(hint, TCP_MIN_RCVMSS); @@ -1225,28 +1235,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -extern void tcp_cwnd_application_limited(struct sock *sk); - -/* Congestion window validation. (RFC2861) */ - -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) -{ - __u32 packets_out = tp->packets_out; - - if (packets_out >= tp->snd_cwnd) { - /* Network is feed fully. */ - tp->snd_cwnd_used = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; - } else { - /* Network starves. */ - if (tp->packets_out > tp->snd_cwnd_used) - tp->snd_cwnd_used = tp->packets_out; - - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) - tcp_cwnd_application_limited(sk); - } -} - /* Set slow start threshould and cwnd not falling to slow start */ static inline void __tcp_enter_cwr(struct tcp_sock *tp) { @@ -1279,12 +1267,6 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp) return 3; } -static __inline__ int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, const struct sk_buff *skb) { @@ -1292,122 +1274,18 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -/* Return 0, if packet can be sent now without violation Nagle's rules: - 1. It is full sized. - 2. Or it contains FIN. - 3. Or TCP_NODELAY was set. - 4. Or TCP_CORK is not set, and all sent packets are ACKed. - With Minshall's modification: all sent small packets are ACKed. - */ - -static __inline__ int -tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *); - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. - */ -static __inline__ int tcp_snd_test(struct sock *sk, - struct sk_buff *skb, - unsigned cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); - - if (!pkts) { - tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); - } - - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ - - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); -} - static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out && !tp->pending) tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); } -static __inline__ int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -static __inline__ void __tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp, - unsigned cur_mss, - int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); -} - static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; diff --git a/net/core/dev.c b/net/core/dev.c index 7016e0c..7f5f62c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; - dev->flags |= IFF_PROMISC; if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; - if (dev->flags ^ old_flags) { + else + dev->flags |= IFF_PROMISC; + if (dev->flags != old_flags) { dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : diff --git a/net/core/filter.c b/net/core/filter.c index f3b8820..cd91a24 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,7 +36,7 @@ #include <linux/filter.h> /* No hurry in this branch */ -static u8 *load_pointer(struct sk_buff *skb, int k) +static void *__load_pointer(struct sk_buff *skb, int k) { u8 *ptr = NULL; @@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k) return NULL; } +static inline void *load_pointer(struct sk_buff *skb, int k, + unsigned int size, void *buffer) +{ + if (k >= 0) + return skb_header_pointer(skb, k, size, buffer); + else { + if (k >= SKF_AD_OFF) + return NULL; + return __load_pointer(skb, k); + } +} + /** * sk_run_filter - run a filter on a socket * @skb: buffer to run the filter on @@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k) int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - unsigned char *data = skb->data; - /* len is UNSIGNED. Byte wide insns relies only on implicit - type casts to prevent reading arbitrary memory locations. - */ - unsigned int len = skb->len-skb->data_len; struct sock_filter *fentry; /* We walk down these */ + void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + u32 tmp; int k; int pc; @@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_W|BPF_ABS: k = fentry->k; load_w: - if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { - A = ntohl(*(u32*)&data[k]); + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { + A = ntohl(*(u32 *)ptr); continue; } - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohl(*(u32*)ptr); - continue; - } - } else { - u32 _tmp, *p; - p = skb_header_pointer(skb, k, 4, &_tmp); - if (p != NULL) { - A = ntohl(*p); - continue; - } - } return 0; case BPF_LD|BPF_H|BPF_ABS: k = fentry->k; load_h: - if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { - A = ntohs(*(u16*)&data[k]); + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { + A = ntohs(*(u16 *)ptr); continue; } - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohs(*(u16*)ptr); - continue; - } - } else { - u16 _tmp, *p; - p = skb_header_pointer(skb, k, 2, &_tmp); - if (p != NULL) { - A = ntohs(*p); - continue; - } - } return 0; case BPF_LD|BPF_B|BPF_ABS: k = fentry->k; load_b: - if (k >= 0 && (unsigned int)k < len) { - A = data[k]; + ptr = load_pointer(skb, k, 1, &tmp); + if (ptr != NULL) { + A = *(u8 *)ptr; continue; } - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = *ptr; - continue; - } - } else { - u8 _tmp, *p; - p = skb_header_pointer(skb, k, 1, &_tmp); - if (p != NULL) { - A = *p; - continue; - } - } return 0; case BPF_LD|BPF_W|BPF_LEN: - A = len; + A = skb->len; continue; case BPF_LDX|BPF_W|BPF_LEN: - X = len; + X = skb->len; continue; case BPF_LD|BPF_W|BPF_IND: k = X + fentry->k; @@ -259,10 +217,12 @@ load_b: k = X + fentry->k; goto load_b; case BPF_LDX|BPF_B|BPF_MSH: - if (fentry->k >= len) - return 0; - X = (data[fentry->k] & 0xf) << 2; - continue; + ptr = load_pointer(skb, fentry->k, 1, &tmp); + if (ptr != NULL) { + X = (*(u8 *)ptr & 0xf) << 2; + continue; + } + return 0; case BPF_LD|BPF_IMM: A = fentry->k; continue; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb73b21..733deee 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) C(ip_summed); C(priority); C(protocol); - C(security); n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); @@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->pkt_type = old->pkt_type; new->stamp = old->stamp; new->destructor = NULL; - new->security = old->security; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; new->nfcache = old->nfcache; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 9934b25..99bc061 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) if (t < s_t) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); tb = dn_fib_get_table(t, 0); if (tb == NULL) continue; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 658e797..ef74683 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void) static int ipv4_proc_init(void); extern void ipfrag_init(void); +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = ip_rcv, +}; + static int __init inet_init(void) { struct sk_buff *dummy_skb; @@ -1102,6 +1111,8 @@ static int __init inet_init(void) ipfrag_init(); + dev_add_pack(&ip_packet_type); + rc = 0; out: return rc; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b56e88e..4be234c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.324" +#define VERSION "0.325" #include <linux/config.h> #include <asm/uaccess.h> @@ -136,6 +136,7 @@ struct trie_use_stats { unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; + unsigned int resize_node_skipped; }; #endif @@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn); -static struct tnode *halve(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); @@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li) kfree(li); } +static struct tnode *tnode_alloc(unsigned int size) +{ + if (size <= PAGE_SIZE) { + return kmalloc(size, GFP_KERNEL); + } else { + return (struct tnode *) + __get_free_pages(GFP_KERNEL, get_order(size)); + } +} + +static void __tnode_free(struct tnode *tn) +{ + unsigned int size = sizeof(struct tnode) + + (1<<tn->bits) * sizeof(struct node *); + + if (size <= PAGE_SIZE) + kfree(tn); + else + free_pages((unsigned long)tn, get_order(size)); +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1<<bits; int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); - struct tnode *tn = kmalloc(sz, GFP_KERNEL); + struct tnode *tn = tnode_alloc(sz); if(tn) { memset(tn, 0, sz); @@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn) printk("FL %p \n", tn); } else if(IS_TNODE(tn)) { - kfree(tn); + __tnode_free(tn); if(trie_debug > 0 ) printk("FT %p \n", tn); } @@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w static struct node *resize(struct trie *t, struct tnode *tn) { int i; + int err = 0; if (!tn) return NULL; @@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn) */ check_tnode(tn); - + + err = 0; while ((tn->full_children > 0 && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn); + tn = inflate(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } } check_tnode(tn); @@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn) * Halve as long as the number of empty children in this * node is above threshold. */ + + err = 0; while (tn->bits > 1 && 100 * (tnode_child_length(tn) - tn->empty_children) < - halve_threshold * tnode_child_length(tn)) + halve_threshold * tnode_child_length(tn)) { + + tn = halve(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } - tn = halve(t, tn); /* Only one child remains */ @@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn) +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) { struct tnode *inode; struct tnode *oldtnode = tn; @@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and inflate + * of tnode is ignored. + */ + + for(i = 0; i < olen; i++) { + struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + + if (inode && + IS_TNODE(inode) && + inode->pos == oldtnode->pos + oldtnode->bits && + inode->bits > 1) { + struct tnode *left, *right; + + t_key m = TKEY_GET_MASK(inode->pos, 1); + + left = tnode_new(inode->key&(~m), inode->pos + 1, + inode->bits - 1); + + if(!left) { + *err = -ENOMEM; + break; + } + + right = tnode_new(inode->key|m, inode->pos + 1, + inode->bits - 1); + + if(!right) { + *err = -ENOMEM; + break; + } + + put_child(t, tn, 2*i, (struct node *) left); + put_child(t, tn, 2*i+1, (struct node *) right); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); @@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if(IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, + if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) put_child(t, tn, 2*i, node); else @@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) * the position (inode->pos) */ - t_key m = TKEY_GET_MASK(inode->pos, 1); - /* Use the old key, but set the new significant * bit to zero. */ - left = tnode_new(inode->key&(~m), inode->pos + 1, - inode->bits - 1); - if(!left) - trie_bug("tnode_new failed"); - - - /* Use the old key, but set the new significant - * bit to one. - */ - right = tnode_new(inode->key|m, inode->pos + 1, - inode->bits - 1); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); + + if(!left) + BUG(); + + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); + + if(!right) + BUG(); - if(!right) - trie_bug("tnode_new failed"); - size = tnode_child_length(left); for(j = 0; j < size; j++) { put_child(t, left, j, inode->child[j]); @@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) return tn; } -static struct tnode *halve(struct trie *t, struct tnode *tn) +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) { struct tnode *oldtnode = tn; struct node *left, *right; @@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if(!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and halve + * of tnode is ignored. + */ + + for(i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* Two nonempty children */ + if( left && right) { + struct tnode *newBinNode = + tnode_new(left->key, tn->pos + tn->bits, 1); + + if(!newBinNode) { + *err = -ENOMEM; + break; + } + put_child(t, tn, i/2, (struct node *)newBinNode); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); @@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* Two nonempty children */ else { struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); + (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); if(!newBinNode) - trie_bug("tnode_new failed"); + BUG(); put_child(t, newBinNode, 0, left); put_child(t, newBinNode, 1, right); @@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); + seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); #ifdef CLEAR_STATS memset(&(t->stats), 0, sizeof(t->stats)); #endif diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ce5c32..9de83e6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; @@ -1329,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 12a1cf3..726ea5e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -54,6 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -70,6 +71,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; -} __attribute__((__aligned__(8))); +}; +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS. + */ +#if NR_CPUS >= 32 +#define RT_HASH_LOCK_SZ 4096 +#elif NR_CPUS >= 16 +#define RT_HASH_LOCK_SZ 2048 +#elif NR_CPUS >= 8 +#define RT_HASH_LOCK_SZ 1024 +#elif NR_CPUS >= 4 +#define RT_HASH_LOCK_SZ 512 +#else +#define RT_HASH_LOCK_SZ 256 +#endif + +static spinlock_t *rt_hash_locks; +# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ + if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + spin_lock_init(&rt_hash_locks[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, /* This runs via a timer and thus is always in BH context. */ static void rt_check_expire(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + unsigned int i = rover, goal; struct rtable *rth, **rthp; unsigned long now = jiffies; - - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { + u64 mult; + + mult = ((u64)ip_rt_gc_interval) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - spin_lock(&rt_hash_table[i].lock); + if (*rthp == 0) + continue; + spin_lock(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ @@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy) rt_free(rth); #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock(&rt_hash_table[i].lock); + spin_unlock(rt_hash_lock_addr(i)); /* Fallback loop breaker. */ if (time_after(jiffies, now)) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy) get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -780,7 +818,7 @@ static int rt_garbage_collect(void) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -812,7 +850,7 @@ static int rt_garbage_collect(void) goal--; #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock_bh(&rt_hash_table[k].lock); + spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -882,7 +920,7 @@ restart: rthp = &rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (!(rth->u.dst.flags & DST_BALANCED) && @@ -908,7 +946,7 @@ restart: rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -949,7 +987,7 @@ restart: if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -990,7 +1028,7 @@ restart: } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt) { struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt) rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) { - int i, order, goal, rc = 0; + int rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE + { + int order; for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) /* NOTHING */; @@ -3086,6 +3126,7 @@ int __init ip_rt_init(void) if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE << order); + } #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", @@ -3096,36 +3137,19 @@ int __init ip_rt_init(void) if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); - goal = num_physpages >> (26 - PAGE_SHIFT); - if (rhash_entries) - goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; - for (order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - do { - rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct rt_hash_bucket); - while (rt_hash_mask & (rt_hash_mask - 1)) - rt_hash_mask--; - rt_hash_table = (struct rt_hash_bucket *) - __get_free_pages(GFP_ATOMIC, order); - } while (rt_hash_table == NULL && --order > 0); - - if (!rt_hash_table) - panic("Failed to allocate IP route cache hash table\n"); - - printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", - rt_hash_mask, - (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); - - for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) - /* NOTHING */; - - rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); - rt_hash_table[i].chain = NULL; - } + rt_hash_table = (struct rt_hash_bucket *) + alloc_large_system_hash("IP route cache", + sizeof(struct rt_hash_bucket), + rhash_entries, + (num_physpages >= 128 * 1024) ? + (27 - PAGE_SHIFT) : + (29 - PAGE_SHIFT), + HASH_HIGHMEM, + &rt_hash_log, + &rt_hash_mask, + 0); + memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 882436d..29894c7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; + int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; copied = 0; err = -EPIPE; @@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -652,7 +653,7 @@ new_segment: goto wait_for_memory; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } if (copy > size) @@ -693,7 +694,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -713,6 +714,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } out: @@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - int tmp = tp->mss_cache_std; + int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + if (sk->sk_route_caps & NETIF_F_TSO) + tmp = 0; + else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } } + return tmp; } @@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, size_goal; int err, copied; long timeo; @@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, skb = sk->sk_write_queue.prev; if (!sk->sk_send_head || - (copy = mss_now - skb->len) <= 0) { + (copy = size_goal - skb->len) <= 0) { new_segment: /* Allocate new segment. If the interface is SG, @@ -837,7 +845,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } /* Try to append data to the end of skb. */ @@ -872,11 +880,6 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } else if (page) { - /* If page is cached, align - * offset to L1 cache boundary - */ - off = (off + L1_CACHE_BYTES - 1) & - ~(L1_CACHE_BYTES - 1); if (off == PAGE_SIZE) { put_page(page); TCP_PAGE(sk) = page = NULL; @@ -937,7 +940,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -957,6 +960,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } } @@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rto = jiffies_to_usecs(tp->rto); info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); - info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = tp->ack.rcv_mss; info->tcpi_unacked = tp->packets_out; @@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, switch (optname) { case TCP_MAXSEG: - val = tp->mss_cache_std; + val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7bbbbc3..8de2f10 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp->mss_cache_std > 1460) + if (tp->mss_cache > 1460) cwnd = 2; else - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; + tp->mss_cache = tp->mss_cache; } if (!tp->sacked_out) @@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache_std))) { + tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) } } -/* There is one downside to this scheme. Although we keep the - * ACK clock ticking, adjusting packet counters and advancing - * congestion window, we do not liberate socket send buffer - * space. - * - * Mucking with skb->truesize and sk->sk_wmem_alloc et al. - * then making a write space wakeup callback is a possible - * future enhancement. WARNING: it is not trivial to make. - */ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, __u32 now, __s32 *seq_rtt) { @@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt * the other end. */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1) + if (tcp_skb_pcount(skb) > 1 && + after(tp->snd_una, scb->seq)) acked |= tcp_tso_acked(sk, skb, now, &seq_rtt); break; @@ -3308,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } +static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +{ + /* If the user specified a specific send buffer setting, do + * not modify it. + */ + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ + if (tcp_memory_pressure) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + return 0; + + /* If we filled the congestion window, do not expand. */ + if (tp->packets_out >= tp->snd_cwnd) + return 0; + + return 1; +} /* When incoming ACK allowed to free some skb from write_queue, * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket @@ -3319,11 +3333,8 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out < tp->snd_cwnd && - !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + if (tcp_should_expand_sndbuf(sk, tp)) { + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); @@ -3346,22 +3357,9 @@ static inline void tcp_check_space(struct sock *sk) } } -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - -static __inline__ void tcp_data_snd_check(struct sock *sk) +static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { - struct sk_buff *skb = sk->sk_send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); + tcp_push_pending_frames(sk, tp); tcp_check_space(sk); } @@ -3655,7 +3653,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -3721,7 +3719,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); if (!tcp_ack_scheduled(tp)) goto no_ack; } @@ -3799,7 +3797,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); return 0; @@ -4109,7 +4107,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } @@ -4300,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf1123..62f62bb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; tp->ca_ops = &tcp_init_congestion_ops; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e17c24..e041d05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1; * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ -int sysctl_tcp_tso_win_divisor = 8; +int sysctl_tcp_tso_win_divisor = 3; static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, tp->ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk) +static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { struct tcp_sock *tp = tcp_sk(sk); - tcp_dec_quickack_mode(tp); + tcp_dec_quickack_mode(tp, pkts); tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } @@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) tp->af_specific->send_check(sk, th, skb->len, skb); if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); @@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. - */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } - } -} - -void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) } /* Link BUFF into the send queue. */ + skb_header_release(buff); __skb_append(skb, buff); return 0; @@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ - -unsigned int tcp_current_mss(struct sock *sk, int large) +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u16 xmit_size_goal; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - if (do_large) { - unsigned int large_mss, factor, limit; + xmit_size_goal = mss_now; - large_mss = 65535 - tp->af_specific->net_header_len - + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - tp->ext_header_len - tp->tcp_header_len; - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); + if (tp->max_window && + (xmit_size_goal > (tp->max_window >> 1))) + xmit_size_goal = max((tp->max_window >> 1), + 68U - tp->tcp_header_len); + + xmit_size_goal -= (xmit_size_goal % mss_now); + } + tp->xmit_size_goal = xmit_size_goal; - factor = large_mss / mss_now; + return mss_now; +} - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; +/* Congestion window validation. (RFC2861) */ - tp->mss_cache = mss_now * factor; +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out = tp->packets_out; + + if (packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; - mss_now = tp->mss_cache; + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); } +} - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - return mss_now; +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. */ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + + buff->truesize = nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, in_flight; + + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 0; + + if (tp->ca_state != TCP_CA_Open) + return 0; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + if (sysctl_tcp_tso_win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + return 0; + } + + /* Ok, it looks like it is advisable to defer. */ + return 1; } /* This routine writes packets to the network. It advances the @@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); - unsigned int mss_now; + struct sk_buff *skb; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all * will be happy. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *skb; - int sent_pkts = 0; + if (unlikely(sk->sk_state == TCP_CLOSE)) + return 0; + + skb = sk->sk_send_head; + if (unlikely(!skb)) + return 0; + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_cwnd_test(tp, skb); + if (unlikely(!cwnd_quota)) + goto out; + + sent_pkts = 0; + while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + BUG_ON(!tso_segs); + + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); - - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) break; } - - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) break; + } - /* Advance the send_head. This one is sent out. - * This call will increment packets_out. - */ - update_send_head(sk, tp, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + break; + + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); + + tcp_minshall_update(tp, mss_now, skb); + sent_pkts++; + + /* Do not optimize this to use tso_segs. If we chopped up + * the packet above, tso_segs will no longer be valid. + */ + cwnd_quota -= tcp_skb_pcount(skb); + + BUG_ON(cwnd_quota < 0); + if (!cwnd_quota) + break; + + skb = sk->sk_send_head; + if (!skb) + break; + tso_segs = tcp_init_tso_segs(sk, skb); + } + + if (likely(sent_pkts)) { + tcp_cwnd_validate(sk, tp); + return 0; + } +out: + return !tp->packets_out && sk->sk_send_head; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned int cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; - tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; + if (skb) { + if (tcp_write_xmit(sk, cur_mss, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + BUG_ON(!skb || skb->len < mss_now); + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); + + if (likely(cwnd_quota)) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (unlikely(tso_fragment(sk, skb, limit))) + return; + } + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) + return; } - if (sent_pkts) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); - return 0; + return; } - - return !tp->packets_out && sk->sk_send_head; } - return 0; } /* This function returns the amount that we can raise the @@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_route_caps & NETIF_F_TSO) { sock_set_flag(sk, SOCK_NO_LARGESEND); sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2b193e3..28d9bca 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -774,7 +774,6 @@ static int __init inet6_init(void) if (if6_proc_init()) goto proc_if6_fail; #endif - ipv6_packet_init(); ip6_route_init(); ip6_flowlabel_init(); err = addrconf_init(); @@ -791,6 +790,8 @@ static int __init inet6_init(void) /* Init v6 transport protocols. */ udpv6_init(); tcpv6_init(); + + ipv6_packet_init(); err = 0; out: return err; @@ -798,7 +799,6 @@ out: addrconf_fail: ip6_flowlabel_cleanup(); ip6_route_cleanup(); - ipv6_packet_cleanup(); #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 06e7cda..1f2c2f9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9dac7fd..f6e288d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; diff --git a/net/sched/Makefile b/net/sched/Makefile index 8f58cec..e48d0d4 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -4,7 +4,7 @@ obj-y := sch_generic.o -obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += police.o diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 48bb23c..53d98f8 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol) dst->value = skb->protocol; } -META_COLLECTOR(int_security) -{ - dst->value = skb->security; -} - META_COLLECTOR(int_pkttype) { dst->value = skb->pkt_type; @@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(REALDEV)] = META_FUNC(int_realdev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), - [META_ID(SECURITY)] = META_FUNC(int_security), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 05e6e0a..b9a069a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) { int err; struct rtattr *kind = tca[TCA_KIND-1]; - void *p = NULL; struct Qdisc *sch; struct Qdisc_ops *ops; - int size; ops = qdisc_lookup_ops(kind); #ifdef CONFIG_KMOD @@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) if (ops == NULL) goto err_out; - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; - - p = kmalloc(size, GFP_KERNEL); - err = -ENOBUFS; - if (!p) + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) { + err = PTR_ERR(sch); goto err_out2; - memset(p, 0, size); - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; - - INIT_LIST_HEAD(&sch->list); - skb_queue_head_init(&sch->q); + } - if (handle == TC_H_INGRESS) + if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; - - sch->ops = ops; - sch->enqueue = ops->enqueue; - sch->dequeue = ops->dequeue; - sch->dev = dev; - dev_hold(dev); - atomic_set(&sch->refcnt, 1); - sch->stats_lock = &dev->queue_lock; - if (handle == 0) { + handle = TC_H_MAKE(TC_H_INGRESS, 0); + } else if (handle == 0) { handle = qdisc_alloc_handle(dev); err = -ENOMEM; if (handle == 0) goto err_out3; } - if (handle == TC_H_INGRESS) - sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); - else - sch->handle = handle; + sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, + tca[TCA_RATE-1]); + if (err) { + /* + * Any broken qdiscs that would require + * a ops->reset() here? The qdisc was never + * in action so it shouldn't be necessary. + */ + if (ops->destroy) + ops->destroy(sch); + goto err_out3; + } + } +#endif qdisc_lock_tree(dev); list_add_tail(&sch->list, &dev->qdisc_list); qdisc_unlock_tree(dev); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); -#endif return sch; } err_out3: dev_put(dev); + kfree((char *) sch - sch->padded); err_out2: module_put(ops->owner); err_out: *errp = err; - if (p) - kfree(p); return NULL; } diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c new file mode 100644 index 0000000..81f0b83 --- /dev/null +++ b/net/sched/sch_blackhole.c @@ -0,0 +1,54 @@ +/* + * net/sched/sch_blackhole.c Black hole queue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * + * Note: Quantum tunneling is not supported. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static struct Qdisc_ops blackhole_qdisc_ops = { + .id = "blackhole", + .priv_size = 0, + .enqueue = blackhole_enqueue, + .dequeue = blackhole_dequeue, + .owner = THIS_MODULE, +}; + +static int __init blackhole_module_init(void) +{ + return register_qdisc(&blackhole_qdisc_ops); +} + +static void __exit blackhole_module_exit(void) +{ + unregister_qdisc(&blackhole_qdisc_ops); +} + +module_init(blackhole_module_init) +module_exit(blackhole_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7683b34..73e218e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = { .owner = THIS_MODULE, }; -struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; - int size; + unsigned int size; + int err = -ENOBUFS; /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; + size = QDISC_ALIGN(sizeof(*sch)); + size += ops->priv_size + (QDISC_ALIGNTO - 1); p = kmalloc(size, GFP_KERNEL); if (!p) - return NULL; + goto errout; memset(p, 0, size); - - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); @@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) dev_hold(dev); sch->stats_lock = &dev->queue_lock; atomic_set(&sch->refcnt, 1); + + return sch; +errout: + return ERR_PTR(-err); +} + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + struct Qdisc *sch; + + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) + goto errout; + if (!ops->init || ops->init(sch, NULL) == 0) return sch; - dev_put(dev); - kfree(p); +errout: return NULL; } @@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up); EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(noop_qdisc_ops); EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(qdisc_alloc); EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_restart); |