diff options
author | Bob Pearson <rpearson@systemfabricworks.com> | 2012-03-23 15:02:24 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-23 16:58:37 -0700 |
commit | 0292c497b6b942557d085b37f888ef5865f67d37 (patch) | |
tree | fe88508060c0ecdf8fb3d5e98ca2c55057cc5068 /lib | |
parent | 324eb0f17d9dcead3c60c133aa244f6b3631fec9 (diff) | |
download | kernel_goldelico_gta04-0292c497b6b942557d085b37f888ef5865f67d37.zip kernel_goldelico_gta04-0292c497b6b942557d085b37f888ef5865f67d37.tar.gz kernel_goldelico_gta04-0292c497b6b942557d085b37f888ef5865f67d37.tar.bz2 |
crc32: optimize loop counter for x86
Add two changes that improve the performance of x86 systems:
1. Replace the main loop with an incrementing counter. This change improves
the performance of the selftest by about 5-6% on Nehalem CPUs. The
apparent reason is that the compiler can use the loop index to perform
an indexed memory access. This is reported to make the performance of
PowerPC CPUs worse.
2. Replace the rem_len loop with an incrementing counter. This change
improves the performance of the selftest, which has more than the usual
number of occurrences, by about 1-2% on x86 CPUs. In actual work loads
the length is most often a multiple of 4 bytes and this code does not
get executed as often, if at all. Again, this change is reported to make
the performance of PowerPC worse.
[djwong@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>
Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/crc32.c | 13 |
1 files changed, 13 insertions, 0 deletions
diff --git a/lib/crc32.c b/lib/crc32.c index 826e163..4eac9c7 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) # endif const u32 *b; size_t rem_len; +# ifdef CONFIG_X86 + size_t i; +# endif const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; u32 q; @@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) # endif b = (const u32 *)buf; +# ifdef CONFIG_X86 + --b; + for (i = 0; i < len; i++) { +# else for (--b; len; --len) { +# endif q = crc ^ *++b; /* use pre increment for speed */ # if CRC_LE_BITS == 32 crc = DO_CRC4; @@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) /* And the last few bytes */ if (len) { u8 *p = (u8 *)(b + 1) - 1; +# ifdef CONFIG_X86 + for (i = 0; i < len; i++) + DO_CRC(*++p); /* use pre increment for speed */ +# else do { DO_CRC(*++p); /* use pre increment for speed */ } while (--len); +# endif } return crc; #undef DO_CRC |