Skip to content

Commit

Permalink
Merge pull request #1487 from ampli/tracon-set
Browse files Browse the repository at this point in the history
Solves issue #1479 by fixing the connector/tracon hashing
  • Loading branch information
linas authored Apr 16, 2024
2 parents b60bb59 + 82cfb94 commit 5d8fa72
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 81 deletions.
2 changes: 1 addition & 1 deletion link-grammar/connectors.c
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ static bool condesc_grow(ConTable *ct)

condesc_t *condesc_add(ConTable *ct, const char *constring)
{
uint32_t hash = (connector_hash_t)connector_str_hash(constring);
uint32_t hash = (connector_uc_hash_t)connector_str_hash(constring);
hdesc_t *h = condesc_find(ct, constring, hash);

if (NULL == h->desc)
Expand Down
29 changes: 26 additions & 3 deletions link-grammar/connectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@

typedef uint64_t lc_enc_t;

typedef uint32_t connector_hash_t;
typedef uint32_t connector_uc_hash_t;

#define CD_HEAD_DEPENDENT (1<<0) /* Has a leading 'h' or 'd'. */
#define CD_HEAD (1<<1) /* 0: dependent; 1: head; */
Expand Down Expand Up @@ -85,7 +85,7 @@ struct condesc_struct

const char *string; /* The connector name w/o the direction mark, e.g. AB */
// float *cost; /* Array of cost by connector length (cost[0]: default) */
connector_hash_t uc_num; /* uc part enumeration. */
connector_uc_hash_t uc_num; /* uc part enumeration. */
uint8_t length_limit; /* If not 0, it gives the limit of the length of the
* link that can be used on this connector type. The
* value UNLIMITED_LEN specifies no limit.
Expand All @@ -111,7 +111,7 @@ typedef struct length_limit_def
typedef struct hdesc
{
condesc_t *desc;
connector_hash_t str_hash;
connector_uc_hash_t str_hash;
} hdesc_t;

typedef struct
Expand Down Expand Up @@ -309,6 +309,29 @@ static inline uint32_t string_hash(const char *s)
return i;
}

typedef uint32_t connector_hash_t;

/**
 * Hash a single connector.
 * The result is the sum of the descriptor's uc_num, the multi flag
 * shifted to bit 19, the lowest bit of lc_mask shifted to bit 20, and
 * lc_letters truncated to connector_hash_t.
 */
static inline connector_hash_t connector_hash(const Connector *c)
{
	connector_hash_t hash = c->desc->uc_num;

	hash += (c->multi << 19);
	hash += ((connector_hash_t)c->desc->lc_mask & 1) << 20;
	hash += (connector_hash_t)c->desc->lc_letters;

	return hash;
}

/**
 * Hash a connector list: start from the hash of the first connector and,
 * for every following one, multiply the accumulator by 19 and add that
 * connector's hash.
 * \p c is assumed to be non-NULL.
 */
static inline connector_hash_t connector_list_hash(const Connector *c)
{
	connector_hash_t accum = connector_hash(c);
	const Connector *e = c->next;

	while (e != NULL)
	{
		accum = (19 * accum) + connector_hash(e);
		e = e->next;
	}

	return accum;
}

/**
* Hash function for the classic parser linkage memoization.
*/
Expand Down
16 changes: 7 additions & 9 deletions link-grammar/disjunct-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,17 +236,15 @@ struct disjunct_dup_table_s
static inline unsigned int old_hash_disjunct(disjunct_dup_table *dt,
Disjunct * d, bool string_too)
{
unsigned int i;
i = 0;
for (Connector *e = d->left; e != NULL; e = e->next) {
i = (41 * (i + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7;
}
for (Connector *e = d->right; e != NULL; e = e->next) {
i = (41 * (i + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7;
}
unsigned int i = 0;

if (NULL != d->left)
i = connector_list_hash(d->left);
if (NULL != d->right)
i += 19 * connector_list_hash(d->right);
if (string_too)
i += string_hash(d->word_string);
i += (i>>10);
//i += (i>>10);

d->dup_hash = i;
return (i & (dt->dup_table_size-1));
Expand Down
2 changes: 1 addition & 1 deletion link-grammar/parse/prune.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ static void power_table_init(Sentence sent, Tracon_sharing *ts, power_table *pt)
static void clean_table(unsigned int size, C_list **t)
{
/* Table entry tombstone. */
#define UC_NUM_TOMBSTONE ((connector_hash_t)-1)
#define UC_NUM_TOMBSTONE ((connector_uc_hash_t)-1)
static condesc_t desc_no_match =
{
.string = "TOMBSTONE",
Expand Down
160 changes: 97 additions & 63 deletions link-grammar/tracon-set.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
/* */
/*************************************************************************/

#ifdef DEBUG
#include <inttypes.h> // format macros
#endif
#define D_TRACON_SET 8 // Debug level for this file

#include "const-prime.h"
#include "connectors.h"
#ifdef TRACON_SET_DEBUG
#include "disjunct-utils.h" // print_connector_list_str
#endif
#include "tracon-set.h"
#include "utilities.h"

Expand Down Expand Up @@ -46,66 +47,99 @@
* (only, their value remains intact).
*/

static unsigned int hash_connectors(const Connector *c, unsigned int shallow)
static tid_hash_t hash_connectors(const Connector *c, unsigned int shallow)
{
unsigned int accum = shallow && c->shallow;

for (; c != NULL; c = c->next)
{
accum = (19 * accum) +
c->desc->uc_num +
(((unsigned int)c->multi)<<20) +
(((unsigned int)c->desc->lc_letters)<<22);
}
tid_hash_t accum = (shallow && c->shallow) ? 1000003 : 0;

return accum;
return accum + connector_list_hash(c);
}

#if 0
/**
* @count Expected number of table elements.
* @return Prime number to use
*
* This function was used for shrinking the table in an attempt to reduce
* cache/swap thrashing if the table temporarily grows very big. However, it
* had a bug, and it is not clear when to shrink the table - shrinking it
* unnecessarily can cause the overhead of a table growth. Keep for
* possible reimplementation of a similar idea.
*/
static unsigned int find_prime_for(size_t count)
{
size_t i;
for (i = 0; i < MAX_S_PRIMES; i ++)
if (count < MAX_TRACON_SET_TABLE_SIZE(s_prime[i])) return i;

assert(0, "%zu: Absurdly big count", count);
return 0;
lgdebug(+0, "Warning: %zu: Absurdly big count", count);
return -1;
}
#endif

void tracon_set_reset(Tracon_set *ss)
#ifdef TRACON_SET_DEBUG
static void tracon_set_print(Tracon_set *ss)
{
size_t ncount = MAX(ss->count, ss->ocount);

/* Table sizing heuristic: The number of tracons as a function of
* word number is usually first increasing and then decreasing.
* Continue the trend of the last 2 words. */
if (ss->count > ss->ocount)
ncount = ncount * 3 / 4;
else
ncount = ncount * 4 / 3;
unsigned int prime_idx = find_prime_for(ncount);
if (prime_idx < ss->prime_idx) ss->prime_idx = prime_idx;
if (test_enabled("tracon-set-print"))
{
clist_slot *t;

ss->size = s_prime[ss->prime_idx];
ss->mod_func = prime_mod_func[ss->prime_idx];
printf("tracon_set_print %p:\n", ss);
for (size_t i = 0; i < ss->size; i++)
{
t = &ss->table[i];
if (0 == t->hash) continue;
tid_hash_t x = ss->mod_func(t->hash);
char *cstr = print_connector_list_str(t->clist, 0);
printf("[%zu]: h %zu pri %u sec %u %c %s\n", i, (size_t)t->hash, t->pri_collN,
t->sec_collN, "yn"[i == x], cstr);
free(cstr);
}
}
}

/**
 * Emit one line of usage statistics for a tracon set through lgdebug()
 * at debug level D_TRACON_SET.
 *
 * @param a Printed with %p as the first field of the line.
 * @param ss The set whose counters are reported.
 * @param where Free-text label printed after the pointer (e.g. "reset").
 *
 * The reported fields are: resetN, prime_idx, addN ("acc"), the percentage
 * of the table in use, pri_collN/addN ("coll/acc"), and
 * (pri_collN + sec_collN)/addN ("chain").
 *
 * NOTE(review): when addN is 0 the last two ratios are floating-point
 * divisions by zero - confirm that the resulting inf/nan output is acceptable.
 */
static void tracon_set_stats(Tracon_set *a, Tracon_set *ss, const char *where)
{
lgdebug(+D_TRACON_SET,
"%p: %s: reset %u prime_idx %u acc %zu used %2.2f%% "
"coll/acc %.4f chain %.4f\n",
a, where, ss->resetN, ss->prime_idx, ss->addN,
/* Used slots = maximum usable slots for this size minus those still available. */
100.f * ((int)MAX_TRACON_SET_TABLE_SIZE(ss->size) - ss->available_count) / ss->size,
1.* ss->pri_collN/ss->addN,
1. * (ss->pri_collN + ss->sec_collN) / ss->addN);
}
#else
/* No-op stand-ins compiled when TRACON_SET_DEBUG is not defined, so that
 * the callers can invoke them unconditionally.
 * Note: no ';' after the function bodies - an empty declaration at file
 * scope is not valid ISO C. */
static void tracon_set_print(Tracon_set *ss)
{
	(void)ss;
}

static void tracon_set_stats(Tracon_set *a, Tracon_set *ss, const char *where)
{
	(void)a;
	(void)ss;
	(void)where;
}
#endif

/**
 * Empty the tracon set so it can be reused.
 * The table keeps its current size; all of its slots are zeroed and
 * available_count is restored to the maximum for that size.
 * Before clearing, the statistics are reported through tracon_set_stats()
 * and tracon_set_print() is called on the still-populated table.
 */
void tracon_set_reset(Tracon_set *ss)
{
#ifdef TRACON_SET_DEBUG
ss->resetN++;
#endif
tracon_set_stats(ss, ss, "reset");
tracon_set_print(ss);
memset(ss->table, 0, ss->size * sizeof(clist_slot));
/* Remember the element count of the use that just ended. */
ss->ocount = ss->count;
ss->count = 0;
ss->available_count = MAX_TRACON_SET_TABLE_SIZE(ss->size);
}

/**
 * Allocate a new, empty tracon set.
 * The set structure is zeroed, so prime_idx starts at 0; the table is
 * allocated with s_prime[0] zeroed slots and the matching mod function.
 *
 * @return The new set. tracon_set_delete() frees both the table and the set.
 *
 * NOTE(review): the results of the two malloc() calls are not checked
 * before use.
 */
Tracon_set *tracon_set_create(void)
{
Tracon_set *ss = (Tracon_set *) malloc(sizeof(Tracon_set));

ss->prime_idx = 0;
memset(ss, 0, sizeof(Tracon_set));
// ss->prime_idx = 0;
ss->size = s_prime[ss->prime_idx];
ss->mod_func = prime_mod_func[ss->prime_idx];
ss->table = (clist_slot *) malloc(ss->size * sizeof(clist_slot));
memset(ss->table, 0, ss->size * sizeof(clist_slot));
ss->count = ss->ocount = 0;
ss->shallow = false;
/* Number of insertions allowed before the table must grow. */
ss->available_count = MAX_TRACON_SET_TABLE_SIZE(ss->size);

#ifdef TRACON_SET_DEBUG
lgdebug(+D_TRACON_SET, "%p: prime_idx %u available_count %zu\n",
ss, ss->prime_idx, ss->available_count);
#endif

return ss;
}

Expand All @@ -128,21 +162,8 @@ static bool connector_list_equal(const Connector *c1, const Connector *c2)
return (c1 == NULL) && (c2 == NULL);
}

#if defined DEBUG || defined TRACON_SET_DEBUG
uint64_t fp_count;
uint64_t coll_count;
static void prt_stat(void)
{
lgdebug(+5, "tracon_set: %"PRIu64" accesses, chain %.4f\n",
fp_count, 1.*(fp_count+coll_count)/fp_count);
}
#define PRT_STAT(...) __VA_ARGS__
#else
#define PRT_STAT(...)
#endif

static bool place_found(const Connector *c, const clist_slot *slot,
unsigned int hash, Tracon_set *ss)
tid_hash_t hash, Tracon_set *ss)
{
if (slot->clist == NULL) return true;
if (hash != slot->hash) return false;
Expand All @@ -155,17 +176,27 @@ static bool place_found(const Connector *c, const clist_slot *slot,
* lookup the given string in the table. Return an index
* to the place it is, or the place where it should be.
*/
static unsigned int find_place(const Connector *c, unsigned int h,
Tracon_set *ss)
static tid_hash_t find_place(const Connector *c, tid_hash_t h,
Tracon_set *ss)
{
PRT_STAT(if (fp_count == 0) atexit(prt_stat); fp_count++;)
unsigned int coll_num = 0;
unsigned int key = ss->mod_func(h);
tid_hash_t key = ss->mod_func(h);

/* Quadratic probing. */
while (!place_found(c, &ss->table[key], h, ss))
{
PRT_STAT(coll_count++;)
#ifdef TRACON_SET_DEBUG
if (0 == coll_num)
{
ss->pri_collN++;
ss->table[key].pri_collN++;
}
else
{
ss->sec_collN++;
ss->table[key].sec_collN++;
}
#endif
key += 2 * ++coll_num - 1;
if (key >= ss->size) key = ss->mod_func(key);
}
Expand All @@ -177,7 +208,7 @@ static void grow_table(Tracon_set *ss)
{
Tracon_set old = *ss;

PRT_STAT(uint64_t fp_count_save = fp_count;)
tracon_set_stats(ss, &old, "before grow");
ss->prime_idx++;
ss->size = s_prime[ss->prime_idx];
ss->mod_func = prime_mod_func[ss->prime_idx];
Expand All @@ -187,15 +218,14 @@ static void grow_table(Tracon_set *ss)
{
if (old.table[i].clist != NULL)
{
unsigned int p = find_place(old.table[i].clist, old.table[i].hash, ss);
tid_hash_t p = find_place(old.table[i].clist, old.table[i].hash, ss);
ss->table[p] = old.table[i];
}
}
ss->available_count = MAX_STRING_SET_TABLE_SIZE(ss->size) -
MAX_STRING_SET_TABLE_SIZE(old.size);

/* printf("growing from %zu to %zu\n", old.size, ss->size); */
PRT_STAT(fp_count = fp_count_save);
tracon_set_stats(ss, ss, "after grow");
free(old.table);
}

Expand All @@ -207,34 +237,38 @@ void tracon_set_shallow(bool shallow, Tracon_set *ss)
Connector **tracon_set_add(Connector *clist, Tracon_set *ss)
{
assert(clist != NULL, "Can't insert a null list");
#ifdef TRACON_SET_DEBUG
ss->addN++;
#endif

/* We may need to add it to the table. If the table got too big,
* first we grow it. */
if (ss->available_count == 0) grow_table(ss);

unsigned int h = hash_connectors(clist, ss->shallow);
unsigned int p = find_place(clist, h, ss);
tid_hash_t h = hash_connectors(clist, ss->shallow);
tid_hash_t p = find_place(clist, h, ss);

if (ss->table[p].clist != NULL)
return &ss->table[p].clist;

ss->table[p].hash = h;
ss->count++;
ss->available_count--;

return &ss->table[p].clist;
}

Connector *tracon_set_lookup(const Connector *clist, Tracon_set *ss)
{
unsigned int h = hash_connectors(clist, ss->shallow);
unsigned int p = find_place(clist, h, ss);
tid_hash_t h = hash_connectors(clist, ss->shallow);
tid_hash_t p = find_place(clist, h, ss);
return ss->table[p].clist;
}

/**
 * Free a tracon set together with its table.
 * A NULL \p ss is ignored. Otherwise tracon_set_stats() and
 * tracon_set_print() are called on the set before it is freed.
 */
void tracon_set_delete(Tracon_set *ss)
{
	if (ss != NULL)
	{
		tracon_set_stats(ss, ss, "delete");
		tracon_set_print(ss);
		free(ss->table);
		free(ss);
	}
}
Loading

0 comments on commit 5d8fa72

Please sign in to comment.