diff --git a/link-grammar/connectors.c b/link-grammar/connectors.c index d66a35163..ed8852042 100644 --- a/link-grammar/connectors.c +++ b/link-grammar/connectors.c @@ -541,7 +541,7 @@ static bool condesc_grow(ConTable *ct) condesc_t *condesc_add(ConTable *ct, const char *constring) { - uint32_t hash = (connector_hash_t)connector_str_hash(constring); + uint32_t hash = (connector_uc_hash_t)connector_str_hash(constring); hdesc_t *h = condesc_find(ct, constring, hash); if (NULL == h->desc) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index 07db14509..365830716 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -50,7 +50,7 @@ typedef uint64_t lc_enc_t; -typedef uint32_t connector_hash_t; +typedef uint32_t connector_uc_hash_t; #define CD_HEAD_DEPENDENT (1<<0) /* Has a leading 'h' or 'd'. */ #define CD_HEAD (1<<1) /* 0: dependent; 1: head; */ @@ -85,7 +85,7 @@ struct condesc_struct const char *string; /* The connector name w/o the direction mark, e.g. AB */ // float *cost; /* Array of cost by connector length (cost[0]: default) */ - connector_hash_t uc_num; /* uc part enumeration. */ + connector_uc_hash_t uc_num; /* uc part enumeration. */ uint8_t length_limit; /* If not 0, it gives the limit of the length of the * link that can be used on this connector type. The * value UNLIMITED_LEN specifies no limit. @@ -111,7 +111,7 @@ typedef struct length_limit_def typedef struct hdesc { condesc_t *desc; - connector_hash_t str_hash; + connector_uc_hash_t str_hash; } hdesc_t; typedef struct @@ -309,6 +309,29 @@ static inline uint32_t string_hash(const char *s) return i; } +typedef uint32_t connector_hash_t; + +static inline connector_hash_t connector_hash(const Connector *c) +{ + return c->desc->uc_num + + (c->multi << 19) + + (((connector_hash_t)c->desc->lc_mask & 1) << 20) + + (connector_hash_t)c->desc->lc_letters; +} + +/** + * \p c is assumed to be non-NULL. 
+ */ +static inline connector_hash_t connector_list_hash(const Connector *c) +{ + connector_hash_t accum = connector_hash(c); + + for (c = c->next; c != NULL; c = c->next) + accum = (19 * accum) + connector_hash(c); + + return accum; +} + /** * Hash function for the classic parser linkage memoization. */ diff --git a/link-grammar/disjunct-utils.c b/link-grammar/disjunct-utils.c index f794448d2..98b629c05 100644 --- a/link-grammar/disjunct-utils.c +++ b/link-grammar/disjunct-utils.c @@ -236,17 +236,15 @@ struct disjunct_dup_table_s static inline unsigned int old_hash_disjunct(disjunct_dup_table *dt, Disjunct * d, bool string_too) { - unsigned int i; - i = 0; - for (Connector *e = d->left; e != NULL; e = e->next) { - i = (41 * (i + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7; - } - for (Connector *e = d->right; e != NULL; e = e->next) { - i = (41 * (i + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7; - } + unsigned int i = 0; + + if (NULL != d->left) + i = connector_list_hash(d->left); + if (NULL != d->right) + i += 19 * connector_list_hash(d->right); if (string_too) i += string_hash(d->word_string); - i += (i>>10); + //i += (i>>10); d->dup_hash = i; return (i & (dt->dup_table_size-1)); diff --git a/link-grammar/parse/prune.c b/link-grammar/parse/prune.c index b9aae7346..1ec8e9950 100644 --- a/link-grammar/parse/prune.c +++ b/link-grammar/parse/prune.c @@ -447,7 +447,7 @@ static void power_table_init(Sentence sent, Tracon_sharing *ts, power_table *pt) static void clean_table(unsigned int size, C_list **t) { /* Table entry tombstone. 
*/ -#define UC_NUM_TOMBSTONE ((connector_hash_t)-1) +#define UC_NUM_TOMBSTONE ((connector_uc_hash_t)-1) static condesc_t desc_no_match = { .string = "TOMBSTONE", diff --git a/link-grammar/tracon-set.c b/link-grammar/tracon-set.c index 6527882f5..0847c5722 100644 --- a/link-grammar/tracon-set.c +++ b/link-grammar/tracon-set.c @@ -11,12 +11,13 @@ /* */ /*************************************************************************/ -#ifdef DEBUG -#include // format macros -#endif +#define D_TRACON_SET 8 // Debug level for this file #include "const-prime.h" #include "connectors.h" +#ifdef TRACON_SET_DEBUG +#include "disjunct-utils.h" // print_connector_list_str +#endif #include "tracon-set.h" #include "utilities.h" @@ -46,50 +47,79 @@ * (only, their value remains intact). */ -static unsigned int hash_connectors(const Connector *c, unsigned int shallow) +static tid_hash_t hash_connectors(const Connector *c, unsigned int shallow) { - unsigned int accum = shallow && c->shallow; - - for (; c != NULL; c = c->next) - { - accum = (19 * accum) + - c->desc->uc_num + - (((unsigned int)c->multi)<<20) + - (((unsigned int)c->desc->lc_letters)<<22); - } + tid_hash_t accum = (shallow && c->shallow) ? 1000003 : 0; - return accum; + return accum + connector_list_hash(c); } +#if 0 +/** + * @param count Expected number of table elements. + * @return Prime number to use + * + * This function was used for shrinking the table in an attempt to cause less + * cache/swap thrashing if the table temporarily grows very big. However, it + * had a bug, and it is not clear when to shrink the table - shrinking it + * unnecessarily can cause the overhead of a table growth. Keep for + * possible reimplementation of a similar idea. 
+ */ static unsigned int find_prime_for(size_t count) { size_t i; for (i = 0; i < MAX_S_PRIMES; i ++) if (count < MAX_TRACON_SET_TABLE_SIZE(s_prime[i])) return i; - assert(0, "%zu: Absurdly big count", count); - return 0; + lgdebug(+0, "Warning: %zu: Absurdly big count", count); + return -1; } +#endif -void tracon_set_reset(Tracon_set *ss) +#ifdef TRACON_SET_DEBUG +static void tracon_set_print(Tracon_set *ss) { - size_t ncount = MAX(ss->count, ss->ocount); - - /* Table sizing heuristic: The number of tracons as a function of - * word number is usually first increasing and then decreasing. - * Continue the trend of the last 2 words. */ - if (ss->count > ss->ocount) - ncount = ncount * 3 / 4; - else - ncount = ncount * 4 / 3; - unsigned int prime_idx = find_prime_for(ncount); - if (prime_idx < ss->prime_idx) ss->prime_idx = prime_idx; + if (test_enabled("tracon-set-print")) + { + clist_slot *t; - ss->size = s_prime[ss->prime_idx]; - ss->mod_func = prime_mod_func[ss->prime_idx]; + printf("tracon_set_print %p:\n", ss); + for (size_t i = 0; i < ss->size; i++) + { + t = &ss->table[i]; + if (0 == t->hash) continue; + tid_hash_t x = ss->mod_func(t->hash); + char *cstr = print_connector_list_str(t->clist, 0); + printf("[%zu]: h %zu pri %u sec %u %c %s\n", i, (size_t)t->hash, t->pri_collN, + t->sec_collN, "yn"[i == x], cstr); + free(cstr); + } + } +} + +static void tracon_set_stats(Tracon_set *a, Tracon_set *ss, const char *where) +{ + lgdebug(+D_TRACON_SET, + "%p: %s: reset %u prime_idx %u acc %zu used %2.2f%% " + "coll/acc %.4f chain %.4f\n", + a, where, ss->resetN, ss->prime_idx, ss->addN, + 100.f * ((int)MAX_TRACON_SET_TABLE_SIZE(ss->size) - ss->available_count) / ss->size, + 1.* ss->pri_collN/ss->addN, + 1. 
* (ss->pri_collN + ss->sec_collN) / ss->addN); +} +#else +static void tracon_set_print(Tracon_set *ss){}; +static void tracon_set_stats(Tracon_set *a, Tracon_set *ss, const char *where){}; +#endif + +void tracon_set_reset(Tracon_set *ss) +{ +#ifdef TRACON_SET_DEBUG + ss->resetN++; +#endif + tracon_set_stats(ss, ss, "reset"); + tracon_set_print(ss); memset(ss->table, 0, ss->size * sizeof(clist_slot)); - ss->ocount = ss->count; - ss->count = 0; ss->available_count = MAX_TRACON_SET_TABLE_SIZE(ss->size); } @@ -97,15 +127,19 @@ Tracon_set *tracon_set_create(void) { Tracon_set *ss = (Tracon_set *) malloc(sizeof(Tracon_set)); - ss->prime_idx = 0; + memset(ss, 0, sizeof(Tracon_set)); + // ss->prime_idx = 0; ss->size = s_prime[ss->prime_idx]; ss->mod_func = prime_mod_func[ss->prime_idx]; ss->table = (clist_slot *) malloc(ss->size * sizeof(clist_slot)); memset(ss->table, 0, ss->size * sizeof(clist_slot)); - ss->count = ss->ocount = 0; - ss->shallow = false; ss->available_count = MAX_TRACON_SET_TABLE_SIZE(ss->size); +#ifdef TRACON_SET_DEBUG + lgdebug(+D_TRACON_SET, "%p: prime_idx %u available_count %zu\n", + ss, ss->prime_idx, ss->available_count); +#endif + return ss; } @@ -128,21 +162,8 @@ static bool connector_list_equal(const Connector *c1, const Connector *c2) return (c1 == NULL) && (c2 == NULL); } -#if defined DEBUG || defined TRACON_SET_DEBUG -uint64_t fp_count; -uint64_t coll_count; -static void prt_stat(void) -{ - lgdebug(+5, "tracon_set: %"PRIu64" accesses, chain %.4f\n", - fp_count, 1.*(fp_count+coll_count)/fp_count); -} -#define PRT_STAT(...) __VA_ARGS__ -#else -#define PRT_STAT(...) -#endif - static bool place_found(const Connector *c, const clist_slot *slot, - unsigned int hash, Tracon_set *ss) + tid_hash_t hash, Tracon_set *ss) { if (slot->clist == NULL) return true; if (hash != slot->hash) return false; @@ -155,17 +176,27 @@ static bool place_found(const Connector *c, const clist_slot *slot, * lookup the given string in the table. 
Return an index * to the place it is, or the place where it should be. */ -static unsigned int find_place(const Connector *c, unsigned int h, - Tracon_set *ss) +static tid_hash_t find_place(const Connector *c, tid_hash_t h, + Tracon_set *ss) { - PRT_STAT(if (fp_count == 0) atexit(prt_stat); fp_count++;) unsigned int coll_num = 0; - unsigned int key = ss->mod_func(h); + tid_hash_t key = ss->mod_func(h); /* Quadratic probing. */ while (!place_found(c, &ss->table[key], h, ss)) { - PRT_STAT(coll_count++;) +#ifdef TRACON_SET_DEBUG + if (0 == coll_num) + { + ss->pri_collN++; + ss->table[key].pri_collN++; + } + else + { + ss->sec_collN++; + ss->table[key].sec_collN++; + } +#endif key += 2 * ++coll_num - 1; if (key >= ss->size) key = ss->mod_func(key); } @@ -177,7 +208,7 @@ static void grow_table(Tracon_set *ss) { Tracon_set old = *ss; - PRT_STAT(uint64_t fp_count_save = fp_count;) + tracon_set_stats(ss, &old, "before grow"); ss->prime_idx++; ss->size = s_prime[ss->prime_idx]; ss->mod_func = prime_mod_func[ss->prime_idx]; @@ -187,15 +218,14 @@ static void grow_table(Tracon_set *ss) { if (old.table[i].clist != NULL) { - unsigned int p = find_place(old.table[i].clist, old.table[i].hash, ss); + tid_hash_t p = find_place(old.table[i].clist, old.table[i].hash, ss); ss->table[p] = old.table[i]; } } ss->available_count = MAX_STRING_SET_TABLE_SIZE(ss->size) - MAX_STRING_SET_TABLE_SIZE(old.size); - /* printf("growing from %zu to %zu\n", old.size, ss->size); */ - PRT_STAT(fp_count = fp_count_save); + tracon_set_stats(ss, ss, "after grow"); free(old.table); } @@ -207,19 +237,21 @@ void tracon_set_shallow(bool shallow, Tracon_set *ss) Connector **tracon_set_add(Connector *clist, Tracon_set *ss) { assert(clist != NULL, "Can't insert a null list"); +#ifdef TRACON_SET_DEBUG + ss->addN++; +#endif /* We may need to add it to the table. If the table got too big, * first we grow it. 
*/ if (ss->available_count == 0) grow_table(ss); - unsigned int h = hash_connectors(clist, ss->shallow); - unsigned int p = find_place(clist, h, ss); + tid_hash_t h = hash_connectors(clist, ss->shallow); + tid_hash_t p = find_place(clist, h, ss); if (ss->table[p].clist != NULL) return &ss->table[p].clist; ss->table[p].hash = h; - ss->count++; ss->available_count--; return &ss->table[p].clist; @@ -227,14 +259,16 @@ Connector **tracon_set_add(Connector *clist, Tracon_set *ss) Connector *tracon_set_lookup(const Connector *clist, Tracon_set *ss) { - unsigned int h = hash_connectors(clist, ss->shallow); - unsigned int p = find_place(clist, h, ss); + tid_hash_t h = hash_connectors(clist, ss->shallow); + tid_hash_t p = find_place(clist, h, ss); return ss->table[p].clist; } void tracon_set_delete(Tracon_set *ss) { if (ss == NULL) return; + tracon_set_stats(ss, ss, "delete"); + tracon_set_print(ss); free(ss->table); free(ss); } diff --git a/link-grammar/tracon-set.h b/link-grammar/tracon-set.h index 887418278..a8c785d95 100644 --- a/link-grammar/tracon-set.h +++ b/link-grammar/tracon-set.h @@ -18,25 +18,42 @@ #include #include "api-types.h" +#include "connectors.h" #include "const-prime.h" #include "error.h" +#ifdef DEBUG +#ifndef TRACON_SET_DEBUG +#define TRACON_SET_DEBUG +#endif +#endif + +typedef connector_hash_t tid_hash_t; typedef struct { Connector *clist; - unsigned int hash; + tid_hash_t hash; +#ifdef TRACON_SET_DEBUG + unsigned int pri_collN; + unsigned int sec_collN; +#endif } clist_slot; typedef struct { size_t size; /* the current size of the table */ - size_t count; /* number of things currently in the table */ size_t available_count; /* number of available entries */ - size_t ocount; /* the count before reset */ clist_slot *table; /* the table itself */ - unsigned int prime_idx; /* current prime number table index */ prime_mod_func_t mod_func; /* the function to compute a prime modulo */ + unsigned int prime_idx; /* current prime number table index */ bool 
shallow; /* consider shallow connector */ +#ifdef TRACON_SET_DEBUG + /* size_t is used here instead of uint64_t to prevent the need for PRIu64. */ + size_t addN; /* Number of tries to add */ + size_t pri_collN; /* Number of primary collisions */ + size_t sec_collN; /* Number of secondary collisions */ + unsigned int resetN; /* Number of table resets */ +#endif } Tracon_set; /* If the table gets too big, we grow it. Too big is defined as being