Phil CK

Bench strcpy

Last updated on

I had a need to parse a huge JSON file, and I was curious how much performance speed up could be had just checking the first character, and hashing everything instead of using strcmp.

Full code can be found here

Results

Platformstrcmpstrcmp with prefixhash runtimehash ahead of time
1(GCC)13183591838140
2(Clang)327154752640243
3(GCC)4666215112923542
3(Clang)4592199413449509

Times are in cycles using rdtsc counter

The platforms tested are

  1. Linux Ubuntu 18 | Intel(R) Core(TM) i7-8550U CPU @ 1.8GHz
  2. MacOSX 10.14 | Intel(R) Core(TM) i5-5257U @ 2.70GHz
  3. Linux Ubuntu 16 | Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz

Strcpy

A basic strcpy benchmark, sets the baseline.

uint64_t
bench_strcmp() {
        const char **str_it = &strings[0];
        uint64_t start = get_time_rdtsc();

        while(*str_it) {
          if(strcmp(*str_it, search_for) == 0) {
            break;
          }
          ++str_it;
        }

        uint64_t end = get_time_rdtsc();

        printf("Found %s\n", *str_it);

        return end - start;
}

Strcpy Prefix Check

Before running strcpy check the first character, as a runtime solution this is the winner.

int
pre_check(uint8_t *a, uint8_t *b) {
        return *a == *b;
}

uint64_t
bench_strcmp_prefix() {
        const char **str_it = &strings[0];
        uint64_t start = get_time_rdtsc();

        uint8_t *search_int = (uint8_t*)search_for;

        const char *str;
        while(str = *str_it++, str) {
                /* check prefix first */
                if(!pre_check((uint8_t*)*str_it, search_int)) {
                    continue;
                }

                /* compare whole string */
                if(strcmp(*str_it, search_for) == 0){
                        break;
                }
        }

        uint64_t end = get_time_rdtsc();

        printf("Found %s\n", *str_it);

        return end - start;
}

Runtme Hashing

Hashing strings and then comparing the hashes, this doesn’t handle collisions.

uint64_t
hash_str(const char *str) {
        uint64_t hash = 5381;
        int c;
        while(c = *str++, c) {
          hash = ((hash << 5) + hash) + c;
        }

        return hash;
}

uint64_t
bench_hash_rt() {
        const char **str_it = &strings[0];
        uint64_t start = get_time_rdtsc();
        uint64_t search_hash = hash_str(search_for);

        while(*str_it) {
                uint64_t hash = hash_str(*str_it);
                if(hash == search_hash) {
                        break;
                }
                ++str_it;
        }

        uint64_t end = get_time_rdtsc();
        
        printf("Found %s\n", *str_it);
        
        return end - start;
}

Ahead Of Time Hashing

Hashing strings upfront. Predictably this is the fastests.

uint64_t
hash_str(const char *str) {
        uint64_t hash = 5381;
        int c;
        while(c = *str++, c) {
          hash = ((hash << 5) + hash) + c;
        }

        return hash;
}

uint64_t
bench_hash_at() {
        /* build hash table */
        uint64_t hash_arr[(sizeof(strings) / sizeof(strings[0]))];
        int count = (sizeof(strings) / sizeof(strings[0]));
        int i;
        for(i = 0; i < count; ++i) {
                if(strings[i] == NULL) {
                        hash_arr[i] = (uint64_t)-1;
                }
                else {
                        hash_arr[i] = hash_str(strings[i]);
                }
        }

        /* search */
        uint64_t *hash_it = &hash_arr[0];
        uint64_t search_hash = hash_str(search_for);

        uint64_t start = get_time_rdtsc();
        
        while(*hash_it != (uint64_t)-1) {
                if(*hash_it == search_hash) {
                        break;
                }

                ++hash_it;
        }

        uint64_t end = get_time_rdtsc();
        
        printf("Found %s\n", strings[hash_it - &hash_arr[0]]);
        
        return end - start;
}