9
\$\begingroup\$

Introduction

This is my first attempt to mess with memory allocation using C and create something close to a python dictionary. Coming from a high level programming language like Python or Java, I am not really used to correctly handling low level code. I tried to follow C conventions/style but I can tell I already failed that since I lack the discipline required.

About the Implementation

This is a fixed sized hash-table combined with a linked list. So in case of a collision I just try to append a node at the same index. The methods implemented are :

  • hset: sets key + value
  • hget: gets value based on key
  • hdel: deletes key + value based on key
  • display: displays the whole table (mostly for debugging)

Code

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HASH_TABLE_SIZE 255
#define DEBUG_HASH_INDEX 0  // for debugging with predictable indexes

typedef struct HashItem HashItem;

struct HashItem {
  char * key;
  char * value;
  HashItem * tail;
};


void freeNode(struct HashItem * node);


void initialize_hash_table(HashItem * hash_table[]) {
  for (int i=0; i<HASH_TABLE_SIZE; i++) {
    hash_table[i] = NULL;
  }
}


unsigned long get_hash_index(char *str) {
  if (DEBUG_HASH_INDEX) return 4;  // https://xkcd.com/221/
  // http://www.cse.yorku.ca/~oz/hash.html
  unsigned long hash = 5381;
  int c;

  while ((c = *str++))
      hash = ((hash << 5) + hash) + c; /* hash * 33 + c */

  return hash % HASH_TABLE_SIZE;
}


char * hget(HashItem * hash_table[], char * key) {
  unsigned long idx = get_hash_index(key);
  struct HashItem * node = hash_table[idx];
  while (node != NULL) {
    if (strcmp(key, node->key) == 0){
      // if it has the same key we update the value
      return node->value;
    }
    node = node->tail;
  }
  return NULL;
}


void Hdel(HashItem * hash_table[], char * key) {
  unsigned long idx = get_hash_index(key);
  struct HashItem * head = hash_table[idx];
  if (strcmp(key, head->key) == 0){
    // if it has the same key we update the value
    if (head->tail != NULL){
      hash_table[idx] =  head->tail;
    } else {
      hash_table[idx] = NULL;
    }
    freeNode(head);
    return;
  }
}


void hset(HashItem * hash_table[], char * key, char * value) {
  unsigned long idx = get_hash_index(key);

  if (hash_table[idx] == NULL) {
    // idx is empty

    struct HashItem * new_pair = (struct HashItem*) malloc(sizeof(HashItem));
    new_pair->key = key;
    new_pair->value = value;
    new_pair->tail = NULL;
    hash_table[idx] = new_pair;
  } else {
    // idx is not empty

    if (strcmp(key, hash_table[idx]->key) == 0){
      // if it has the same key we update the value
      hash_table[idx]->value = value;

    } else {
      // we insert it in the tail
      struct HashItem * node = hash_table[idx];

      while (1) {
        if (node->tail == NULL) {
          struct HashItem * new_pair = (struct HashItem*) malloc(sizeof(HashItem));
          new_pair->key = key;
          new_pair->value = value;
          new_pair->tail = NULL;
          node->tail = new_pair;
          break;
        }
        if (strcmp(key, node->tail->key) == 0) {
          node->tail->value = value;
          break;
        }
        node = node->tail;

      } // end while

    } // end else
  }
}


void display(HashItem * hash_table[]) {
  // displays the key/values alongside their idx
  struct HashItem * node;
  int inner;
  for (int idx = 0; idx < HASH_TABLE_SIZE; idx++) {

    if (hash_table[idx] != NULL) {

      printf("DISP idx: %d/0 | key: %s | value: %s\n", idx, hash_table[idx]->key, hash_table[idx]->value);
      node = hash_table[idx]->tail;
      inner = 1;
      while (node != NULL) {
        printf("DISP idx: %d/%d | key: %s | value: %s\n", idx, inner, node->key, node->value);
        inner++;
        node = node->tail;
      }
    }
  }
}


void freeNode(struct HashItem * node) {
  free(node->key);
  free(node->value);
  free(node);
  node = NULL;
}


void freeInnerNodes(struct HashItem * node) {
  if (node == NULL) return;
  if (node->tail != NULL) freeInnerNodes(node->tail);
  printf("FREE | key: %s | value: %s\n", node->key, node->value);
  freeNode(node);
}


void freeHashTable(HashItem * hash_table[]) {
  for (int idx = 0; idx < HASH_TABLE_SIZE; idx++) {
    if (hash_table[idx] != NULL) {
      freeInnerNodes(hash_table[idx]);
    }
  }
}


static char *rand_string(char *str, size_t size) {
    const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJK...";
    if (size) {
        --size;
        for (size_t n = 0; n < size; n++) {
            int key = rand() % (int) (sizeof charset - 1);
            str[n] = charset[key];
        }
        str[size] = '\0';
    }
    return str;
}


char* rand_string_alloc(size_t size) {
     char *s = malloc(size + 1);
     if (s) {
         rand_string(s, size);
     }
     return s;
}


int main() {
  struct HashItem* HASH_TABLE[HASH_TABLE_SIZE];
  char * randkey = rand_string_alloc(12);
  initialize_hash_table(HASH_TABLE);
  hset(HASH_TABLE, randkey, rand_string_alloc(12));
  for (int i = 0; i < 5; i++) {
    hset(HASH_TABLE, rand_string_alloc(12), rand_string_alloc(12));
  }
  display(HASH_TABLE);
  Hdel(HASH_TABLE, randkey);
  display(HASH_TABLE);
  freeHashTable(HASH_TABLE);
  return 0;
}
\$\endgroup\$
0

1 Answer 1

8
\$\begingroup\$

General

Good work for a C newcomer! Welcome to the club!

Thank you for providing a good test program, and for the macro to exercise the list code - that really helps give confidence in the implementation. The tests do have some limitations, since they only test inserting elements and freeing the entire table; there's no tests of lookup, updating values or removing single entries, for example. Tests of removal should remove elements from beginning, end and interior of lists, and of singleton lists.

The code compiles almost cleanly (just a single warning) and appears not to leak, double-free, or access inaccessible memory:

gcc -std=c17 -fPIC -g -Wall -Wextra -Wwrite-strings -Wno-parentheses -Wpedantic -Warray-bounds      -Wconversion    215494.c    -o 215494
215494.c: In function ‘get_hash_index’:
215494.c:34:35: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
       hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
                                   ^
make TOOL='valgrind --leak-check=full' 215494.run
make[1]: Entering directory '/home/tms/stackexchange/review'
ulimit -v 1048576 -t 60; exec \
valgrind --leak-check=full ./215494   
==17736== Memcheck, a memory error detector
==17736== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==17736== Using Valgrind-3.14.0 and LibVEX; rerun with -h for copyright info
==17736== Command: ./215494
==17736== 
DISP idx: 11/0 | key: sDhHqlcDnv. | value: hDwkwdBpjcw
DISP idx: 69/0 | key: HwkKbzfehqz | value: yr.eJkngluK
DISP idx: 88/0 | key: xgrJHpAmjvc | value: BktdguAmqli
DISP idx: 108/0 | key: jF.rlbJDhhq | value: mKvagbzjeta
DISP idx: 149/0 | key: tJkIsBbrw.m | value: I.muKitmAAo
DISP idx: 235/0 | key: edEhiKsCydl | value: gjHrepwzohI
DISP idx: 11/0 | key: sDhHqlcDnv. | value: hDwkwdBpjcw
DISP idx: 69/0 | key: HwkKbzfehqz | value: yr.eJkngluK
DISP idx: 108/0 | key: jF.rlbJDhhq | value: mKvagbzjeta
DISP idx: 149/0 | key: tJkIsBbrw.m | value: I.muKitmAAo
DISP idx: 235/0 | key: edEhiKsCydl | value: gjHrepwzohI
FREE | key: sDhHqlcDnv. | value: hDwkwdBpjcw
FREE | key: HwkKbzfehqz | value: yr.eJkngluK
FREE | key: jF.rlbJDhhq | value: mKvagbzjeta
FREE | key: tJkIsBbrw.m | value: I.muKitmAAo
FREE | key: edEhiKsCydl | value: gjHrepwzohI
==17736== 
==17736== HEAP SUMMARY:
==17736==     in use at exit: 0 bytes in 0 blocks
==17736==   total heap usage: 19 allocs, 19 frees, 1,324 bytes allocated
==17736== 
==17736== All heap blocks were freed -- no leaks are possible
==17736== 
==17736== For counts of detected and suppressed errors, rerun with: -v
==17736== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

Let's fix the warning:

      hash = hash * 33 + (unsigned)c;

Note that I've eliminated the need for the comment: multiplying by a constant is an operation that compilers are very good at optimising, and you should expect to see the same object code from both versions. It doesn't help performance to write it less clearly.

Private helpers

We should use static for the internal functions that are not part of the public interface. This will be important when we compile the implementation separately so that we can link with other code.

Const correctness

Consider which pointer arguments point to data that should not be modified, and add const where appropriate:

static unsigned long get_hash_index(char const *str) {

(or const char *; the order of the keywords doesn't matter, but the position relative to * does matter).

Allocations

We correctly test the result of malloc() here:

 char *s = malloc(size + 1);
 if (s) {
     rand_string(s, size);
 }
 return s;

But we use it without checking in other locations, such as:

struct HashItem * new_pair = (struct HashItem*) malloc(sizeof(HashItem));
new_pair->key = key;
new_pair->value = value;

That's Undefined behaviour waiting to bite. When we fix that, we'll need to change the function interface so that we can report the failure to the calling code.

BTW, there's no need to cast the result of malloc() like that; pointers to void can be assigned to variables of any pointer type.

I'd recommend re-writing that as

struct HashItem *new_pair = malloc(sizeof *new_pair);
if (!new_pair) {
    return ERROR_VALUE; /* use your preferred error type/value here */
}
new_pair->key = key;
new_pair->value = value;

Avoid while (1)

I think we can refactor hset() to replace the indefinite loop with a definite one, by reversing the sense of the if test:

bool hset(HashItem *hash_table[], char const *key, char const *value)
{
    unsigned long idx = get_hash_index(key);

    /* search the list for matching key */
    for (struct HashItem *node = hash_table[idx];  node;  node = node->tail) {
        if (strcmp(key, node->key) == 0) {
            node->value = value;
            return true;          /* success */
        }
    }

    /* not found - insert at head */
    struct HashItem *new_pair = malloc(sizeof *new_pair);
    if (!new_pair) {
        return false;       /* failed! */
    }

    new_pair->key = key;
    new_pair->value = value;
    new_pair->tail = hash_table[idx];
    hash_table[idx] = new_pair;
    return true;
}

Inserting at the head also eliminates the special case of the empty list - the for loop is empty in that case and we just drop through to the insertion code.

Prefer iteration to recursion

freeInnerNodes() will recurse as deeply as the list is long. We can easily avoid that, by iterating through the list instead:

void freeInnerNodes(struct HashItem * node)
{
    while (node) {
        struct HashItem *next = node->tail;
        freeNode(node);
        node = next;
    }
}

Don't assign to variables going out of scope

Very minor, but the last line of this function is completely pointless:

void freeNode(struct HashItem * node) {
  free(node->key);
  free(node->value);
  free(node);
  node = NULL;
}

No other code can access the variable node, so the assignment achieves nothing.

\$\endgroup\$
2
  • \$\begingroup\$ 1) The difference in the allocations part is that the first piece of code, I copied it of stack overflow, while the second one I wrote it myself :D 2) in order for me to iterate when freeing memory instead of using recursion, I would have to know where the tail starts for each table, so I would have to iterate to get there, then "reverse" iterate to free each node. Wouldn't that be inefficient ? 3) I see you are using gcc with a bunch of flags then ulimit ? and at the end valgrind. I just used gcc hash_table.c . Where can I find information about your way of compiling ? Thanks you \$\endgroup\$ Commented Mar 15, 2019 at 18:58
  • 1
    \$\begingroup\$ I've edited to show how to iterate over the nodes, deleting as we go (the trick is to store the tail pointer before removing the node). The GCC flags are mostly to enable plenty of warnings; they are explained very well in the manual (-g includes debug symbols, so we get line numbers in any backtraces; -fPIC is probably not required). The ulimit is just [my standard way](q/165861) of running code from Stack Exchange (it helps prevent out-of-control test programs getting obstructing $DAY_JOB by competing for resources). \$\endgroup\$ Commented Mar 18, 2019 at 8:52

Not the answer you're looking for? Browse other questions tagged or ask your own question.