[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[netCDF #GDV-269599]: remove strlen calls in inner loops of NC_finddim and NC_findvar -- hash version
- Subject: [netCDF #GDV-269599]: remove strlen calls in inner loops of NC_finddim and NC_findvar -- hash version
- Date: Tue, 08 Dec 2009 08:29:57 -0700
Hi Greg,
Thanks for all your work on this enhancement and for providing
the patch.
We have reluctantly decided that it's too late and a little to
risky for us to include this optimization in version 4.1, but
will include it in the snapshots and releases after that.
--Russ
> Attached is another patch that passes the netcdf regression tests, but
> is an improvement over my previous 'lenstr' patch (#TZM-335067).
> Basically, I added a 'hash' field to both NC_var and NC_dim which
> maintains a hash of the string 'name->cp' field. There is then code in
> dim.c and var.c to update the hash field during var and dim creation and
> renaming. The hash value is used in the NC_finddim and NC_findvar inner
> loops instead of doing "strlen((*loc)->name->cp)". For the file I that
> motivated this, I reduced the strlen calls from 476,952,472 to 389,810
> (43.6% of execution time down to 0.25%) and also reduced the strncmp
> calls. To account for possible hash collisions, it still does the
> strncmp call if the hashes match.
>
> I put the hash code in var.c which probably isn't the correct location,
> but it avoided me having to add new files... The hash code I used was
> SuperFastHash from http://www.azillionmonkeys.com/qed/hash.html.
>
> I realize that this patch may not be of general usage and only provides
> a measurable speedup for very large files, but it is also not very
> intrusive. It does increase NC_dim size by a non-negligible amount (3
> size_t + string vs 2 size_t + string).
>
> Anyway, here it is if you want it. Let me know if you want it cleaned
> up or any renamings.
> --Greg
>
> diff -crB netcdf-4.1-beta2/libsrc/dim.c netcdf-4.1-beta2-new/libsrc/dim.c
> *** netcdf-4.1-beta2/libsrc/dim.c 2009-02-20 13:34:55.000000000 -0700
> --- netcdf-4.1-beta2-new/libsrc/dim.c 2009-12-04 10:54:20.000000000 -0700
> ***************
> *** 38,43 ****
> --- 38,44 ----
> return NULL;
>
> dimp->name = name;
> + dimp->hash = SuperFastHash(name->cp, strlen(name->cp));
> dimp->size = 0;
>
> return(dimp);
> ***************
> *** 127,133 ****
> {
>
> int dimid;
> ! size_t slen;
> NC_dim ** loc;
> char *name;
>
> --- 128,134 ----
> {
>
> int dimid;
> ! uint32_t shash;
> NC_dim ** loc;
> char *name;
>
> ***************
> *** 143,153 ****
> name = (char *)utf8proc_NFC((const unsigned char *)uname);
> if(name == NULL)
> return NC_ENOMEM;
> ! slen = strlen(name);
>
> for(; (size_t) dimid < ncap->nelems
> ! && (strlen((*loc)->name->cp) != slen
> ! || strncmp((*loc)->name->cp, name, slen) != 0);
> dimid++, loc++)
> {
> /*EMPTY*/
> --- 144,154 ----
> name = (char *)utf8proc_NFC((const unsigned char *)uname);
> if(name == NULL)
> return NC_ENOMEM;
> ! shash = SuperFastHash(name, strlen(name));
>
> for(; (size_t) dimid < ncap->nelems
> ! && ((*loc)->hash != shash
> ! || strncmp((*loc)->name->cp, name, strlen(name)) != 0);
> dimid++, loc++)
> {
> /*EMPTY*/
> ***************
> *** 524,529 ****
> --- 525,531 ----
> if(newStr == NULL)
> return NC_ENOMEM;
> dimp->name = newStr;
> + dimp->hash = SuperFastHash(newStr->cp, strlen(newStr->cp));
> free_NC_string(old);
> return NC_NOERR;
> }
> ***************
> *** 531,536 ****
> --- 533,539 ----
> /* else, not in define mode */
>
> status = set_NC_string(dimp->name, newname);
> + dimp->hash = SuperFastHash(newname, strlen(newname));
> free(newname);
> if(status != NC_NOERR)
> return status;
> diff -crB netcdf-4.1-beta2/libsrc/nc.h netcdf-4.1-beta2-new/libsrc/nc.h
> *** netcdf-4.1-beta2/libsrc/nc.h 2009-02-24 14:12:40.000000000 -0700
> --- netcdf-4.1-beta2-new/libsrc/nc.h 2009-12-04 10:55:05.000000000 -0700
> ***************
> *** 12,17 ****
> --- 12,18 ----
> #include <config.h>
>
> #include <stddef.h> /* size_t */
> + #include <stdint.h> /* uint32_t */
> #include <sys/types.h> /* off_t */
> #include "netcdf.h"
> #include "ncio.h" /* ncio */
> ***************
> *** 77,82 ****
> --- 78,84 ----
> typedef struct {
> /* all xdr'd */
> NC_string *name;
> + uint32_t hash;
> size_t size;
> } NC_dim;
>
> ***************
> *** 175,180 ****
> --- 177,183 ----
> size_t *dsizes; /* compiled info: the right to left product of shape */
> /* below gets xdr'd */
> NC_string *name;
> + uint32_t hash;
> /* next two: formerly NC_iarray *assoc */ /* user definition */
> size_t ndims; /* assoc->count */
> int *dimids; /* assoc->value */
> ***************
> *** 194,199 ****
> --- 197,205 ----
>
> /* Begin defined in var.c */
>
> + extern uint32_t
> + SuperFastHash (const char * data, int len);
> +
> extern void
> free_NC_var(NC_var *varp);
>
> diff -crB netcdf-4.1-beta2/libsrc/var.c netcdf-4.1-beta2-new/libsrc/var.c
> *** netcdf-4.1-beta2/libsrc/var.c 2009-02-20 13:35:07.000000000 -0700
> --- netcdf-4.1-beta2-new/libsrc/var.c 2009-12-04 10:54:01.000000000 -0700
> ***************
> *** 52,57 ****
> --- 52,58 ----
>
> varp->name = strp;
> varp->ndims = ndims;
> + varp->hash = SuperFastHash(strp->cp, strlen(strp->cp));
>
> if(ndims != 0)
> {
> ***************
> *** 302,308 ****
> NC_findvar(const NC_vararray *ncap, const char *uname, NC_var **varpp)
> {
> NC_var **loc;
> ! size_t slen;
> int varid;
> char *name;
>
> --- 303,309 ----
> NC_findvar(const NC_vararray *ncap, const char *uname, NC_var **varpp)
> {
> NC_var **loc;
> ! uint32_t shash;
> int varid;
> char *name;
>
> ***************
> *** 317,328 ****
> name = (char *)utf8proc_NFC((const unsigned char *)uname);
> if(name == NULL)
> return NC_ENOMEM;
> ! slen = strlen(name);
>
> for(varid = 0; (size_t) varid < ncap->nelems; varid++, loc++)
> {
> ! if(strlen((*loc)->name->cp) == slen &&
> ! strncmp((*loc)->name->cp, name, slen) == 0)
> {
> if(varpp != NULL)
> *varpp = *loc;
> --- 318,329 ----
> name = (char *)utf8proc_NFC((const unsigned char *)uname);
> if(name == NULL)
> return NC_ENOMEM;
> ! shash = SuperFastHash(name, strlen(name));
>
> for(varid = 0; (size_t) varid < ncap->nelems; varid++, loc++)
> {
> ! if((*loc)->hash == shash &&
> ! strncmp((*loc)->name->cp, name, strlen(name)) == 0)
> {
> if(varpp != NULL)
> *varpp = *loc;
> ***************
> *** 817,828 ****
> --- 818,831 ----
> if(newStr == NULL)
> return(-1);
> varp->name = newStr;
> + varp->hash = SuperFastHash(newStr->cp, strlen(newStr->cp));
> free_NC_string(old);
> return NC_NOERR;
> }
>
> /* else, not in define mode */
> status = set_NC_string(varp->name, newname);
> + varp->hash = SuperFastHash(newname, strlen(newname));
> free(newname);
> if(status != NC_NOERR)
> return status;
> ***************
> *** 838,840 ****
> --- 841,900 ----
>
> return NC_NOERR;
> }
> +
> + #include <stdint.h>
> + #undef get16bits
> + #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
> + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
> + #define get16bits(d) (*((const uint16_t *) (d)))
> + #endif
> +
> + #if !defined (get16bits)
> + #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
> + +(uint32_t)(((const uint8_t *)(d))[0]) )
> + #endif
> +
> + uint32_t SuperFastHash (const char * data, int len) {
> + uint32_t hash = len, tmp;
> + int rem;
> +
> + if (len <= 0 || data == 0) return 0;
> +
> + rem = len & 3;
> + len >>= 2;
> +
> + /* Main loop */
> + for (;len > 0; len--) {
> + hash += get16bits (data);
> + tmp = (get16bits (data+2) << 11) ^ hash;
> + hash = (hash << 16) ^ tmp;
> + data += 2*sizeof (uint16_t);
> + hash += hash >> 11;
> + }
> +
> + /* Handle end cases */
> + switch (rem) {
> + case 3: hash += get16bits (data);
> + hash ^= hash << 16;
> + hash ^= data[sizeof (uint16_t)] << 18;
> + hash += hash >> 11;
> + break;
> + case 2: hash += get16bits (data);
> + hash ^= hash << 11;
> + hash += hash >> 17;
> + break;
> + case 1: hash += *data;
> + hash ^= hash << 10;
> + hash += hash >> 1;
> + }
> +
> + /* Force "avalanching" of final 127 bits */
> + hash ^= hash << 3;
> + hash += hash >> 5;
> + hash ^= hash << 4;
> + hash += hash >> 17;
> + hash ^= hash << 25;
> + hash += hash >> 6;
> +
> + return hash;
> + }
>
>
Russ Rew UCAR Unidata Program
address@hidden http://www.unidata.ucar.edu
Ticket Details
===================
Ticket ID: GDV-269599
Department: Support netCDF
Priority: Normal
Status: Closed