This archive contains answers to questions sent to Unidata support through mid-2025. Note that the archive is no longer being updated. We provide the archive for reference; many of the answers presented here remain technically correct, even if somewhat outdated. For the most up-to-date information on the use of NSF Unidata software and data services, please consult the Software Documentation first.
Hi Greg, Thanks for all your work on this enhancement and for providing the patch. We have reluctantly decided that it's too late and a little to risky for us to include this optimization in version 4.1, but will include it in the snapshots and releases after that. --Russ > Attached is another patch that passes the netcdf regression tests, but > is an improvement over my previous 'lenstr' patch (#TZM-335067). > Basically, I added a 'hash' field to both NC_var and NC_dim which > maintains a hash of the string 'name->cp' field. There is then code in > dim.c and var.c to update the hash field during var and dim creation and > renaming. The hash value is used in the NC_finddim and NC_findvar inner > loops instead of doing "strlen((*loc)->name->cp)". For the file I that > motivated this, I reduced the strlen calls from 476,952,472 to 389,810 > (43.6% of execution time down to 0.25%) and also reduced the strncmp > calls. To account for possible hash collisions, it still does the > strncmp call if the hashes match. > > I put the hash code in var.c which probably isn't the correct location, > but it avoided me having to add new files... The hash code I used was > SuperFastHash from http://www.azillionmonkeys.com/qed/hash.html. > > I realize that this patch may not be of general usage and only provides > a measurable speedup for very large files, but it is also not very > intrusive. It does increase NC_dim size by a non-negligible amount (3 > size_t + string vs 2 size_t + string). > > Anyway, here it is if you want it. Let me know if you want it cleaned > up or any renamings. > --Greg > > diff -crB netcdf-4.1-beta2/libsrc/dim.c netcdf-4.1-beta2-new/libsrc/dim.c > *** netcdf-4.1-beta2/libsrc/dim.c 2009-02-20 13:34:55.000000000 -0700 > --- netcdf-4.1-beta2-new/libsrc/dim.c 2009-12-04 10:54:20.000000000 -0700 > *************** > *** 38,43 **** > --- 38,44 ---- > return NULL; > > dimp->name = name; > + dimp->hash = SuperFastHash(name->cp, strlen(name->cp)); > dimp->size = 0; > > return(dimp); > *************** > *** 127,133 **** > { > > int dimid; > ! size_t slen; > NC_dim ** loc; > char *name; > > --- 128,134 ---- > { > > int dimid; > ! uint32_t shash; > NC_dim ** loc; > char *name; > > *************** > *** 143,153 **** > name = (char *)utf8proc_NFC((const unsigned char *)uname); > if(name == NULL) > return NC_ENOMEM; > ! slen = strlen(name); > > for(; (size_t) dimid < ncap->nelems > ! && (strlen((*loc)->name->cp) != slen > ! || strncmp((*loc)->name->cp, name, slen) != 0); > dimid++, loc++) > { > /*EMPTY*/ > --- 144,154 ---- > name = (char *)utf8proc_NFC((const unsigned char *)uname); > if(name == NULL) > return NC_ENOMEM; > ! shash = SuperFastHash(name, strlen(name)); > > for(; (size_t) dimid < ncap->nelems > ! && ((*loc)->hash != shash > ! || strncmp((*loc)->name->cp, name, strlen(name)) != 0); > dimid++, loc++) > { > /*EMPTY*/ > *************** > *** 524,529 **** > --- 525,531 ---- > if(newStr == NULL) > return NC_ENOMEM; > dimp->name = newStr; > + dimp->hash = SuperFastHash(newStr->cp, strlen(newStr->cp)); > free_NC_string(old); > return NC_NOERR; > } > *************** > *** 531,536 **** > --- 533,539 ---- > /* else, not in define mode */ > > status = set_NC_string(dimp->name, newname); > + dimp->hash = SuperFastHash(newname, strlen(newname)); > free(newname); > if(status != NC_NOERR) > return status; > diff -crB netcdf-4.1-beta2/libsrc/nc.h netcdf-4.1-beta2-new/libsrc/nc.h > *** netcdf-4.1-beta2/libsrc/nc.h 2009-02-24 14:12:40.000000000 -0700 > --- netcdf-4.1-beta2-new/libsrc/nc.h 2009-12-04 10:55:05.000000000 -0700 > *************** > *** 12,17 **** > --- 12,18 ---- > #include <config.h> > > #include <stddef.h> /* size_t */ > + #include <stdint.h> /* uint32_t */ > #include <sys/types.h> /* off_t */ > #include "netcdf.h" > #include "ncio.h" /* ncio */ > *************** > *** 77,82 **** > --- 78,84 ---- > typedef struct { > /* all xdr'd */ > NC_string *name; > + uint32_t hash; > size_t size; > } NC_dim; > > *************** > *** 175,180 **** > --- 177,183 ---- > size_t *dsizes; /* compiled info: the right to left product of shape */ > /* below gets xdr'd */ > NC_string *name; > + uint32_t hash; > /* next two: formerly NC_iarray *assoc */ /* user definition */ > size_t ndims; /* assoc->count */ > int *dimids; /* assoc->value */ > *************** > *** 194,199 **** > --- 197,205 ---- > > /* Begin defined in var.c */ > > + extern uint32_t > + SuperFastHash (const char * data, int len); > + > extern void > free_NC_var(NC_var *varp); > > diff -crB netcdf-4.1-beta2/libsrc/var.c netcdf-4.1-beta2-new/libsrc/var.c > *** netcdf-4.1-beta2/libsrc/var.c 2009-02-20 13:35:07.000000000 -0700 > --- netcdf-4.1-beta2-new/libsrc/var.c 2009-12-04 10:54:01.000000000 -0700 > *************** > *** 52,57 **** > --- 52,58 ---- > > varp->name = strp; > varp->ndims = ndims; > + varp->hash = SuperFastHash(strp->cp, strlen(strp->cp)); > > if(ndims != 0) > { > *************** > *** 302,308 **** > NC_findvar(const NC_vararray *ncap, const char *uname, NC_var **varpp) > { > NC_var **loc; > ! size_t slen; > int varid; > char *name; > > --- 303,309 ---- > NC_findvar(const NC_vararray *ncap, const char *uname, NC_var **varpp) > { > NC_var **loc; > ! uint32_t shash; > int varid; > char *name; > > *************** > *** 317,328 **** > name = (char *)utf8proc_NFC((const unsigned char *)uname); > if(name == NULL) > return NC_ENOMEM; > ! slen = strlen(name); > > for(varid = 0; (size_t) varid < ncap->nelems; varid++, loc++) > { > ! if(strlen((*loc)->name->cp) == slen && > ! strncmp((*loc)->name->cp, name, slen) == 0) > { > if(varpp != NULL) > *varpp = *loc; > --- 318,329 ---- > name = (char *)utf8proc_NFC((const unsigned char *)uname); > if(name == NULL) > return NC_ENOMEM; > ! shash = SuperFastHash(name, strlen(name)); > > for(varid = 0; (size_t) varid < ncap->nelems; varid++, loc++) > { > ! if((*loc)->hash == shash && > ! strncmp((*loc)->name->cp, name, strlen(name)) == 0) > { > if(varpp != NULL) > *varpp = *loc; > *************** > *** 817,828 **** > --- 818,831 ---- > if(newStr == NULL) > return(-1); > varp->name = newStr; > + varp->hash = SuperFastHash(newStr->cp, strlen(newStr->cp)); > free_NC_string(old); > return NC_NOERR; > } > > /* else, not in define mode */ > status = set_NC_string(varp->name, newname); > + varp->hash = SuperFastHash(newname, strlen(newname)); > free(newname); > if(status != NC_NOERR) > return status; > *************** > *** 838,840 **** > --- 841,900 ---- > > return NC_NOERR; > } > + > + #include <stdint.h> > + #undef get16bits > + #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ > + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) > + #define get16bits(d) (*((const uint16_t *) (d))) > + #endif > + > + #if !defined (get16bits) > + #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ > + +(uint32_t)(((const uint8_t *)(d))[0]) ) > + #endif > + > + uint32_t SuperFastHash (const char * data, int len) { > + uint32_t hash = len, tmp; > + int rem; > + > + if (len <= 0 || data == 0) return 0; > + > + rem = len & 3; > + len >>= 2; > + > + /* Main loop */ > + for (;len > 0; len--) { > + hash += get16bits (data); > + tmp = (get16bits (data+2) << 11) ^ hash; > + hash = (hash << 16) ^ tmp; > + data += 2*sizeof (uint16_t); > + hash += hash >> 11; > + } > + > + /* Handle end cases */ > + switch (rem) { > + case 3: hash += get16bits (data); > + hash ^= hash << 16; > + hash ^= data[sizeof (uint16_t)] << 18; > + hash += hash >> 11; > + break; > + case 2: hash += get16bits (data); > + hash ^= hash << 11; > + hash += hash >> 17; > + break; > + case 1: hash += *data; > + hash ^= hash << 10; > + hash += hash >> 1; > + } > + > + /* Force "avalanching" of final 127 bits */ > + hash ^= hash << 3; > + hash += hash >> 5; > + hash ^= hash << 4; > + hash += hash >> 17; > + hash ^= hash << 25; > + hash += hash >> 6; > + > + return hash; > + } > > Russ Rew UCAR Unidata Program address@hidden http://www.unidata.ucar.edu Ticket Details =================== Ticket ID: GDV-269599 Department: Support netCDF Priority: Normal Status: Closed