From 08561388cf181dc3eed416a0aeb85c28c9f4cc0b Mon Sep 17 00:00:00 2001 From: Bart Polot Date: Mon, 27 May 2013 23:35:17 +0000 Subject: Fix regexes with accepting common prefix, optimize regex prefix lengths --- src/regex/regex_test_lib.c | 185 ++++++++++++++++++++++++++++++++------------- 1 file changed, 132 insertions(+), 53 deletions(-) (limited to 'src/regex/regex_test_lib.c') diff --git a/src/regex/regex_test_lib.c b/src/regex/regex_test_lib.c index b4c7278c0..df873c112 100644 --- a/src/regex/regex_test_lib.c +++ b/src/regex/regex_test_lib.c @@ -60,29 +60,30 @@ struct RegexCombineCtx { char *s; }; -// static void -// space (int n) -// { -// int i; -// for (i = 0; i < n; i++) -// printf (" "); -// } -// -// static void -// debugctx (struct RegexCombineCtx *ctx, int level) -// { -// struct RegexCombineCtx *p; -// space (level); -// if (NULL != ctx->s) -// printf ("'%s'\n", ctx->s); -// else -// printf ("NULL\n"); -// for (p = ctx->head; NULL != p; p = p->next) -// { -// debugctx (p, level + 1); -// } -// } +/* +static void +space (int n) +{ + int i; + for (i = 0; i < n; i++) + printf (" "); +} +static void +debugctx (struct RegexCombineCtx *ctx, int level) +{ + struct RegexCombineCtx *p; + space (level); + if (NULL != ctx->s) + printf ("'%s'\n", ctx->s); + else + printf ("NULL\n"); + for (p = ctx->head; NULL != p; p = p->next) + { + debugctx (p, level + 1); + } +} +*/ /** * Extract a string from all prefix-combined regexes. @@ -110,7 +111,9 @@ regex_combine (struct RegexCombineCtx *ctx) s = regex_combine (p); GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " total '%s'\n", s); if (strlen(s) == 0) + { opt = GNUNET_YES; + } else { GNUNET_asprintf (&tmp, "%s%s|", regex, s); @@ -136,7 +139,7 @@ regex_combine (struct RegexCombineCtx *ctx) if (NULL != ctx->s) { if (opt) - GNUNET_asprintf (&s, "%s[%s]", ctx->s, regex); + GNUNET_asprintf (&s, "%s(%s)?", ctx->s, regex); else GNUNET_asprintf (&s, "%s(%s)", ctx->s, regex); GNUNET_free (regex); @@ -148,6 +151,68 @@ regex_combine (struct RegexCombineCtx *ctx) } +/** + * Get the number of matching characters on the prefix of both strings. + * + * @param s1 String 1. + * @param s2 String 2. + * + * @return Number of characters of matching prefix. + */ +static unsigned int +get_prefix_length (const char *s1, const char *s2) +{ + unsigned int l1; + unsigned int l2; + unsigned int limit; + unsigned int i; + + l1 = strlen (s1); + l2 = strlen (s2); + limit = l1 > l2 ? l2 : l1; + + for (i = 1; i <= limit; i++) + { + if (0 != strncmp (s1, s2, i)) + return i - 1; + } + return limit; +} + + +/** + * Return the child context with the longest prefix match with the regex. + * Usually only one child will match, search all just in case. + * + * @param ctx Context whose children to search. + * @param regex String to match. + * + * @return Child with the longest prefix, NULL if no child matches. + */ +static struct RegexCombineCtx * +get_longest_prefix (struct RegexCombineCtx *ctx, const char *regex) +{ + struct RegexCombineCtx *p; + struct RegexCombineCtx *best; + unsigned int l; + unsigned int best_l; + + best_l = 0; + best = NULL; + for (p = ctx->head; NULL != p; p = p->next) + { + l = get_prefix_length (p->s, regex); + if (l > best_l) + { + GNUNET_break (0 == best_l); + best = p; + best_l = l; + } + } + return best; +} + + /** * Add a single regex to a context, combining with exisiting regex by-prefix. * @@ -158,42 +223,55 @@ static void regex_add (struct RegexCombineCtx *ctx, const char *regex) { struct RegexCombineCtx *p; - const char *rest; + struct RegexCombineCtx *newctx; + unsigned int prefix_l; + const char *rest_r; + const char *rest_s; + size_t len; - rest = ®ex[1]; - for (p = ctx->head; NULL != p; p = p->next) + if (0 == strlen (regex)) + return; + + p = get_longest_prefix (ctx, regex); + if (NULL != p) /* There is some prefix match, reduce regex and try again */ { - if (p->s[0] == regex[0]) + prefix_l = get_prefix_length (p->s, regex); + rest_s = &p->s[prefix_l]; + rest_r = ®ex[prefix_l]; + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "chosen '%s' [%u]\n", p->s, prefix_l); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "prefix r '%.*s'\n", prefix_l, p->s); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest r '%s'\n", rest_r); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "rest s '%s'\n", rest_s); + len = strlen (p->s); + if (prefix_l < len) /* only partial match, split existing state */ { - if (1 == strlen(p->s)) - { - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "common char %s\n", p->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "adding %s\n", rest); - regex_add (p, rest); - } - else - { - struct RegexCombineCtx *newctx; - newctx = GNUNET_malloc (sizeof (struct RegexCombineCtx)); - newctx->s = GNUNET_strdup (&p->s[1]); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " p has now %s\n", p->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " p will have %.1s\n", p->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " regex is %s\n", regex); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new has now %s\n", newctx->s); - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " rest is now %s\n", rest); - p->s[1] = '\0'; /* dont realloc */ - GNUNET_CONTAINER_DLL_insert (p->head, p->tail, newctx); - regex_add (p, rest); - } - return; + newctx = GNUNET_malloc (sizeof (struct RegexCombineCtx)); + newctx->head = p->head; + newctx->tail = p->tail; + newctx->s = GNUNET_malloc(len - prefix_l + 1); + strncpy (newctx->s, rest_s, len - prefix_l + 1); + + p->head = newctx; + p->tail = newctx; + p->s[prefix_l] = '\0'; } + regex_add (p, rest_r); + return; + } + /* There is no prefix match, add new */ + if (NULL == ctx->head && NULL != ctx->s) + { + /* this was the end before, add empty string */ + newctx = GNUNET_malloc (sizeof (struct RegexCombineCtx)); + newctx->s = GNUNET_strdup (""); + GNUNET_CONTAINER_DLL_insert (ctx->head, ctx->tail, newctx); } - GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n"); + GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " no match\n"); GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " new state %s\n", regex); GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, " under %s\n", ctx->s); - p = GNUNET_malloc (sizeof (struct RegexCombineCtx)); - p->s = GNUNET_strdup (regex); - GNUNET_CONTAINER_DLL_insert (ctx->head, ctx->tail, p); + newctx = GNUNET_malloc (sizeof (struct RegexCombineCtx)); + newctx->s = GNUNET_strdup (regex); + GNUNET_CONTAINER_DLL_insert (ctx->head, ctx->tail, newctx); } @@ -240,9 +318,10 @@ GNUNET_REGEX_combine (char * const regexes[]) current = regexes[i]; GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Regex %u: %s\n", i, current); regex_add (ctx, current); + /* debugctx (ctx, 0); */ } GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "\nCombining...\n"); - //debugctx (ctx, 0); + /* debugctx (ctx, 0); */ combined = regex_combine (ctx); -- cgit v1.2.3